Date: (Fri) Jun 19, 2015
Data: Source: Training: hygiene.dat -> hygiene.txt with “review” in header New:
Time period:
Based on analysis utilizing <> techniques,
Regression results: First run:
Classification results: First run:
First run: All.X.no.rnorm.rf: Leaderboard: 0.525 newobs_tbl=[0=9, 1=191]; submit_filename=template_Final_rf_submit_06_18_01 OOB_conf_mtrx=[4, 102]=106; max.Accuracy.OOB<=0.5454545; opt.prob.threshold.OOB=0.3 R.nchrs.log=100.00; R.npnct13.log=93.59; R.ratio.nstopwrds.nwrds=89.96 R.T.like=84.81; R.T.food=82.79; R.T.just=82.70
After Acronyms scrubbing: All.X.no.rnorm.rf: Leaderboard: 0.545 newobs_tbl=[0=27, 1=173]; submit_filename=template_Final_rf_submit_06_19_01 OOB_conf_mtrx=[7, 93]=100; max.Accuracy.OOB=0.5454545; opt.prob.threshold.OOB=0.3 R.ratio.nstopwrds.nwrds=63.63; R.sum.TfIdf=62.10; R.npnct19.log=46.68 R.T.like=100.00; R.T.noodl=87.23; R.T.just=79.47
Select Models w/ Interactions: -> Discarded Interact.High.cor.Y.glm: Leaderboard: 0.53 newobs_tbl=[0=80, 1=120]; submit_filename=template_Final_glm_submit_06_19_02 OOB_conf_mtrx=[19, 66]=85; max.Accuracy.OOB=0.6136364; opt.prob.threshold.OOB=0.4 R.npnct19.log=100.00; R.npnct19.log:R.nwrds.log=82.73; R.npnct19.log:R.npnct07.log=65.87; R.ratio.nstopwrds.nwrds=; R.sum.TfIdf=; R.T.dish=82.38; R.npnct19.log:R.T.sum=33.26; R.npnct19.log:R.T.vietnames=22.66; R.T.like=; R.T.noodl=; R.T.just=
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
# Session setup: clear the workspace, fix the RNG seed, load the project
# helper scripts and register the parallel backend.
rm(list=ls())                       # NOTE: wipes every visible object in the calling environment
set.seed(12345)                     # fixed seed for reproducible runs
options(stringsAsFactors=FALSE)     # keep character columns as character
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
# Gather all package requirements here
# library() (not require()) so that a missing doMC fails right here with a
# clear error instead of later at registerDoMC() with "could not find function"
suppressPackageStartupMessages(library(doMC))
registerDoMC(4) # max(length(glb_txt_vars), glb_n_cv_folds) + 1
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables

# -- Data sources & output naming
glb_trnng_url  <- "hygiene.txt"
glb_newdt_url  <- "<newdt_url>"
glb_out_pfx    <- "template_"
glb_save_envir <- FALSE                          # or TRUE

# -- How the new (to-be-predicted) observations are obtained
glb_is_separate_newobs_dataset   <- FALSE        # or TRUE
glb_split_entity_newobs_datasets <- TRUE         # or FALSE
glb_split_newdata_method         <- "condition"  # "condition" or "sample" or "copy"
glb_split_newdata_condition      <- "is.na(dirty)"
                                    # or "is.na(<var>)"; "<var> <condition_operator> <value>"
glb_split_newdata_size_ratio     <- 0.3          # > 0 & < 1
glb_split_sample.seed            <- 123          # or any integer
glb_max_fitobs                   <- NULL         # or any integer

# -- Problem type: classification is defined as "not regression"
glb_is_regression     <- FALSE
glb_is_classification <- !glb_is_regression
glb_is_binomial       <- TRUE                    # or FALSE

# -- Response variable
glb_rsp_var_raw <- "dirty"
# for classification, the response variable has to be a factor
glb_rsp_var     <- "dirty.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
#   or contains spaces (e.g. "Not in Labor Force")
#   caret predict(..., type="prob") crashes
# Map the raw response to the modelling factor.
#   raw     : vector of raw response values; 1 maps to "Y", any other
#             non-missing value maps to "N", NA stays NA
#   returns : factor with levels c("N", "Y") ("N" is the reference level)
glb_map_rsp_raw_to_var <- function(raw) {
    ret_vals <- rep_len(NA_character_, length(raw))
    is_obs <- !is.na(raw)
    ret_vals[is_obs] <- ifelse(raw[is_obs] == 1, "Y", "N")
    # Fix the levels explicitly: relevel(as.factor(...), ref="N") errors when
    #   no "N" is present in the input (all-"Y", all-NA or empty vectors)
    return(factor(ret_vals, levels=c("N", "Y")))
#     #as.factor(paste0("B", raw))
#     #as.factor(gsub(" ", "\\.", raw))
}
glb_map_rsp_raw_to_var(c(1, 0, 1, 0, NA))
## [1] Y N Y N <NA>
## Levels: N Y
# Inverse of the raw -> factor response mapping.
#   var     : response factor
#   returns : numeric vector of (level code - 1); NA stays NA
glb_map_rsp_var_to_raw <- function(var) {
    lvl_codes <- as.numeric(var)    # for a factor: its 1-based level codes
    return(lvl_codes - 1)
    # Alternatives for other response encodings:
#     #as.numeric(var)
#     #gsub("\\.", " ", levels(var)[as.numeric(var)])
#     c("<=50K", " >50K")[as.numeric(var)]
#     #c(FALSE, TRUE)[as.numeric(var)]
}
# Round-trip check: raw -> factor -> raw should reproduce the input
glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(c(1, 0, 1, 0, NA)))
## [1] 1 0 1 0 NA
# A mapping function is mandatory when the modelling response differs from
#   the raw response. `&&` (scalar, short-circuit) is the correct operator
#   inside if(); the elementwise `&` was used before.
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var)) {
    stop("glb_map_rsp_raw_to_var function expected")
}
glb_rsp_var_out <- paste0(glb_rsp_var, ".predict.") # model_id is appended later
# List info gathered for various columns
#   <col_name>: <description>; <notes>
#   review: user review in hygiene.dat
#   dirty:  health inspector classification in hygiene.dat.labels ([none] is NA)

# If multiple vars are parts of id, consider concatenating them to create one id var
# If glb_id_var == NULL, ".rownames <- row.names()" is the default
glb_id_var        <- NULL       # or c("<var1>")
glb_category_vars <- NULL       # or c("<var1>", "<var2>")
glb_drop_vars     <- c(NULL)    # or c("<col_name>")

# Value mappings sourced from urls
glb_map_vars <- NULL            # or c("<var1>", "<var2>")
glb_map_urls <- list()
#   glb_map_urls[["<var1>"]] <- "<var1.url>"

# Explicit from -> to value assignments, keyed by variable name
glb_assign_pairs_lst <- NULL
#   glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
#                                            to=c("NA.my"))
glb_assign_vars <- names(glb_assign_pairs_lst)
# Transformations keyed by source column name; each entry is a list of
#   mapfn : function(raw) returning the transformed values
#   sfx   : suffix for the transformed column (this file later refers to the
#           result of "review" + ".my" as "review.my")
glb_transform_lst <- NULL;
glb_transform_lst[["review"]] <- list(
#     mapfn=function(raw) { tfr_raw <- as.character(cut(raw, 5));
#                           tfr_raw[is.na(tfr_raw)] <- "NA.my";
#                           return(as.factor(tfr_raw)) }
#     , sfx=".my.fctr")
    # Scrub the review text: blank out numeric character references and
    #   collapse selected acronyms / multi-word names into single tokens
    mapfn=function(raw) { mod_raw <- raw;
        # "&#ddd;" (3-digit numeric character reference) -> single space
        mod_raw <- gsub("&#[[:digit:]]{3};", " ", mod_raw);
        # Modifications for this exercise only
        # fixed=TRUE: the dots are literal; as a regex "MindyP.S." also
        #   matched any other characters in the dot positions
        mod_raw <- gsub("MindyP.S.", "Mindy P.S.", mod_raw, fixed=TRUE);
        mod_raw <- gsub("\\b[Ii]nternational [Dd]istrict\\b", "InternationalDistrict", mod_raw);
        # "I.D." followed by space/comma/! (kept via \\1) or at end of string
        mod_raw <- gsub("\\bI\\.D\\.([ ,\\!])", "InternationalDistrict\\1", mod_raw);
        mod_raw <- gsub("\\bI\\.D\\.$", "InternationalDistrict", mod_raw);
        mod_raw <- gsub("\\bJ\\.P\\. Patches", "JPPatches", mod_raw);
        mod_raw <- gsub("\\bR\\.B\\. ", "RanchoBravo ", mod_raw);
        mod_raw <- gsub("\\bS\\.O\\.S([ \\.])", "shitonashingle\\1", mod_raw);
        mod_raw <- gsub("\\bshit on a shingle", "shitonashingle", mod_raw);
        mod_raw <- gsub("\\bT\\.T\\.(\\B)", "Tamarind Tree\\1", mod_raw);
        # Re-insert the space missing between "T.V." and "HUGE"
        mod_raw <- gsub("\\bT\\.V\\.HUGE", "T.V. HUGE", mod_raw);
        mod_raw <- gsub("\\bU\\.W\\. ", "UniversityOfWashington ", mod_raw);
        return(mod_raw)
    }
    , sfx=".my")
# raw <- paste(unlist(strsplit(glb_trnobs_df[1, "review"], split=""))[2000:3000], collapse=""); print(raw)
# mapfn(glb_allobs_df$review[739])
# mapfn(glb_allobs_df$review[740])
# ret_lst <- regexec("&#[[:digit:]]{3};", raw, ignore.case=FALSE); ret_lst <- regmatches(raw, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
# ret_lst <- gregexpr("&#[[:digit:]]{3};", raw, ignore.case=FALSE); print(ret_lst); print(length(ret_lst[[1]]))
# mapfn(glb_allobs_df$review)
# mapfn(glb_allobs_df$review[739:740])
# mapfn(glb_allobs_df$<var>)
# glb_transform_lst[["<var1>"]] <- glb_transform_lst[["<var2>"]]
# Add logs of numerics that are not distributed normally -> do automatically ???
glb_transform_vars <- names(glb_transform_lst)
# Date variables with their parse formats & time zones
glb_date_vars <- NULL       # or c("<date_var>")
glb_date_fmts <- list()     # glb_date_fmts[["<date_var>"]] <- "%m/%e/%y"
glb_date_tzs  <- list()     # glb_date_tzs[["<date_var>"]] <- "America/New_York"
#grep("America/New", OlsonNames(), value=TRUE)

# Text variables
glb_txt_vars <- c("review.my")
#Sys.setlocale("LC_ALL", "C") # For english

# Extra stop words per text variable
glb_append_stop_words <- list()
# Remember to use unstemmed words
#orderBy(~ -cor.y.abs, subset(glb_feats_df, grepl("[HSA]\\.T\\.", id) & !is.na(cor.high.X)))
#dsp_obs(Headline.contains="polit")
#subset(glb_allobs_df, H.T.compani > 0)[, c("UniqueID", "Headline", "H.T.compani")]
# glb_append_stop_words[["<txt_var1>"]] <- c(NULL
# #                             ,"<word1>" # <reason1>
#                             )
#subset(glb_allobs_df, S.T.newyorktim > 0)[, c("UniqueID", "Snippet", "S.T.newyorktim")]
#glb_txt_lst[["Snippet"]][which(glb_allobs_df$UniqueID %in% c(8394, 8317, 8339, 8350, 8307))]

# Terms to retain per text variable
glb_important_terms <- list()
# Remember to use stemmed terms

# Sparsity threshold per text variable (term counts observed for each setting)
#   0.99 -> Generates 3916 terms
#   0.95 -> Generates 1422 terms
#   0.90 -> Generates 828 terms
#   0.80 -> Generates 453 terms
#   0.60 -> Generates 146 terms
#   0.30 -> Generates 12 terms   <- current setting
# Properties:
#   numrows(glb_feats_df) << numrows(glb_fitobs_df)
#   Select terms that appear in at least 0.2 * O(FP/FN(glb_OOBobs_df))
#       numrows(glb_OOBobs_df) = 1.1 * numrows(glb_newobs_df)
glb_sprs_thresholds <- setNames(c(0.30), glb_txt_vars)
# Derived features (consolidate this with transform features ???)
# Each entry is keyed by the derived feature name and holds
#   mapfn : function computing the feature from its source column(s)
#   args  : names of the columns passed to mapfn
glb_derive_lst <- NULL;
#     glb_derive_lst[["PTS.diff"]] <- list(
#         mapfn=function(PTS, oppPTS) { return(PTS - oppPTS) }
#         , args=c("PTS", "oppPTS"))
# review.niso8859.log: log(1 + number of "&#ddd;" numeric character
#   references in each review)
glb_derive_lst[["review.niso8859.log"]] <- list(
    mapfn=function(review) {
        match_lst <- gregexpr("&#[[:digit:]]{3};", review)
        # gregexpr() returns a single -1 when nothing matches, so length()
        #   would count a review without matches as having one match;
        #   count only the real (positive) match positions instead.
        #   na.rm=TRUE keeps an NA review at a count of 0.
        match_num_vctr <- vapply(match_lst,
                                 function(elem) sum(elem > 0, na.rm=TRUE),
                                 integer(1))
        return(log(1 + match_num_vctr)) }
    , args=c("review"))
# args_lst <- NULL; for (arg in glb_derive_lst[["PTS.diff"]]$args) args_lst[[arg]] <- glb_allobs_df[, arg]; do.call(mapfn, args_lst)
#     glb_derive_lst[["<var1>"]] <- glb_derive_lst[["<var2>"]]
glb_derive_vars <- names(glb_derive_lst)
# User-specified exclusions
glb_exclude_vars_as_features <- c("review")
# The raw response must not be a feature when a derived response var is modelled
if (glb_rsp_var_raw != glb_rsp_var) {
    glb_exclude_vars_as_features <-
        union(glb_exclude_vars_as_features, glb_rsp_var_raw)
}
# List feats that shd be excluded due to known causation by prediction variable
glb_exclude_vars_as_features <-
    union(glb_exclude_vars_as_features, c(NULL))    # or c("<col_name>")

glb_impute_na_data            <- FALSE  # or TRUE
glb_mice_complete.seed        <- 144    # or any integer
glb_cluster                   <- FALSE  # or TRUE
glb_interaction_only_features <- NULL   # or ???
# Containers for fitted models and their summary
glb_models_lst <- list()
glb_models_df  <- data.frame()

# Model methods to try, by problem type
glb_models_method_vctr <-
    if (glb_is_regression) {
        # Regression
        c("lm", "glm", "bayesglm", "rpart", "rf")
    } else if (glb_is_binomial) {
        # Classification: two classes
        c("glm", "bayesglm", "rpart", "rf")
    } else {
        # Classification: more than two classes
        c("rpart", "rf")
    }
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL    # or c("<col_name>")

# Custom model metric (all NULL -> not used)
glb_model_metric_terms <- NULL
#   or matrix(c(
#       0,1,2,3,4,
#       2,0,1,2,3,
#       4,2,0,1,2,
#       6,4,2,0,1,
#       8,6,4,2,0
#       ), byrow=TRUE, nrow=5)
glb_model_metric          <- NULL   # or "<metric_name>"
glb_model_metric_maximize <- NULL   # or FALSE (TRUE is not the default for both classification & regression)
glb_model_metric_smmry    <- NULL
#   or function(data, lev=NULL, model=NULL) {
#       confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
#       #print(confusion_mtrx)
#       #print(confusion_mtrx * glb_model_metric_terms)
#       metric <- sum(confusion_mtrx * glb_model_metric_terms) / nrow(data)
#       names(metric) <- glb_model_metric
#       return(metric)
#   }

# Tuning grid specification: one row per tuned parameter (or NULL)
glb_tune_models_df <- rbind(
    #data.frame(parameter="cp", min=0.00005, max=0.00005, by=0.000005),
    #seq(from=0.01, to=0.01, by=0.01)
    #data.frame(parameter="mtry", min=080, max=100, by=10),
    #data.frame(parameter="mtry", min=08, max=10, by=1),
    data.frame(parameter="dummy", min=2, max=4, by=1)
)

glb_n_cv_folds          <- 3        # or NULL
glb_clf_proba_threshold <- NULL     # 0.5
# Model selection criteria, by problem type
if (glb_is_regression) {
    glb_model_evl_criteria <-
        c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit")
}
if (glb_is_classification) {
    glb_model_evl_criteria <-
        if (glb_is_binomial) {
            c("max.Accuracy.OOB", "max.auc.OOB", "max.Kappa.OOB", "min.aic.fit")
        } else {
            c("max.Accuracy.OOB", "max.Kappa.OOB")
        }
}

# Model id overrides; the final model id defaults to the selected model id
glb_sel_mdl_id <- NULL              # or "<model_id_prefix>.<model_method>"
glb_fin_mdl_id <- glb_sel_mdl_id    # or "Final"
# Depict the analytics process as a petri net and plot it.
# NOTE(review): petrinet() / ggplot.petrinet() are presumably provided by the
# sourced mypetrinet.R helper -- confirm. M0 looks like the initial marking
# (token count) of each place -- confirm.
glb_analytics_pn <- petrinet(
  name = "glb_analytics_pn",
  # Transitions, with their plot coordinates.
  trans_df = data.frame(
    id = seq_len(6),
    name = c(
      "data.training.all", "data.new",
      "model.selected", "model.final",
      "data.training.all.prediction", "data.new.prediction"
    ),
    x = c(-5, -5, -15, -25, -25, -35),
    y = c(-5, 5, 0, 0, -5, 5)
  ),
  # Places, with their plot coordinates; only "bgn" starts with a non-zero M0.
  places_df = data.frame(
    id = seq_len(4),
    name = c("bgn", "fit.data.training.all", "predict.data.new", "end"),
    x = c(-0, -20, -30, -40),
    y = rep(0, 4),
    M0 = c(3, 0, 0, 0)
  ),
  # Arcs: element i of `begin` is connected to element i of `end`.
  arcs_df = data.frame(
    begin = c(
      "bgn",                          # -> data.training.all
      "bgn",                          # -> data.new
      "bgn",                          # -> model.selected
      "data.training.all",            # -> fit.data.training.all
      "model.selected",               # -> fit.data.training.all
      "fit.data.training.all",        # -> model.final
      "fit.data.training.all",        # -> data.training.all.prediction
      "model.final",                  # -> predict.data.new
      "data.new",                     # -> predict.data.new
      "predict.data.new",             # -> data.new.prediction
      "data.training.all.prediction", # -> end
      "data.new.prediction"           # -> end
    ),
    end = c(
      "data.training.all",
      "data.new",
      "model.selected",
      "fit.data.training.all",
      "fit.data.training.all",
      "model.final",
      "data.training.all.prediction",
      "predict.data.new",
      "predict.data.new",
      "data.new.prediction",
      "end",
      "end"
    )
  )
)
# Unflipped alternative: print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
# Start with no available analytics objects recorded.
glb_analytics_avl_objs <- NULL
# Start the chunk log with the "import.data" step; NULL is passed where an
# existing log would go. The printed result has the columns label, step_major,
# step_minor, bgn, end and elapsed.
# NOTE(review): myadd_chunk() is presumably defined in one of the sourced
# helper scripts -- confirm.
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 11.169 NA NA
1.0: import data
#glb_chunks_df <- myadd_chunk(NULL, "import.data")
# Earlier import variants, kept for reference:
# glb_trnobs_df <- myimport_data(url=glb_trnng_url, comment="glb_trnobs_df",
#                                force_header=TRUE)
# glb_trnobs_df <- read.delim("data/hygiene.txt", header=TRUE, fill=TRUE, sep="\t",
#                             fileEncoding='iso-8859-1')

# Labels: a single column named "dirty"; the literal "[none]" is read as NA.
glb_trnobs_df <- read.table(
  file = "data/hygiene.dat.labels",
  col.names = "dirty",
  na.strings = "[none]"
)
# Review text: every line of hygiene.dat becomes one value of the review column.
glb_trnobs_df$review <- readLines(con = "data/hygiene.dat", n = -1L)
## Warning in readLines("data/hygiene.dat", n = -1): incomplete final line
## found on 'data/hygiene.dat'
# Tag the data frame with its own name via the comment attribute.
comment(glb_trnobs_df) <- "glb_trnobs_df"
# glb_trnobs_df <- data.frame()
# for (symbol in c("Boeing", "CocaCola", "GE", "IBM", "ProcterGamble")) {
# sym_trnobs_df <-
# myimport_data(url=gsub("IBM", symbol, glb_trnng_url), comment="glb_trnobs_df",
# force_header=TRUE)
# sym_trnobs_df$Symbol <- symbol
# glb_trnobs_df <- myrbind_df(glb_trnobs_df, sym_trnobs_df)
# }
# glb_trnobs_df <-
# glb_trnobs_df %>% dplyr::filter(Year >= 1999)
# Build the three working data frames:
#   glb_trnobs_df - training observations
#   glb_newobs_df - new observations
#   glb_allobs_df - combined / original observations
# Either the new observations are imported from their own source, or they are
# carved out of glb_trnobs_df according to glb_split_newdata_method
# ("condition", "sample" or "copy").
if (glb_is_separate_newobs_dataset) {
  glb_newobs_df <- myimport_data(url = glb_newdt_url, comment = "glb_newobs_df",
                                 force_header = TRUE)

  # To make plots / stats / checks easier in chunk:inspectORexplore.data
  glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
  comment(glb_allobs_df) <- "glb_allobs_df"
} else {
  # Snapshot the full data set before any rows are moved out of glb_trnobs_df.
  glb_allobs_df <- glb_trnobs_df
  comment(glb_allobs_df) <- "glb_allobs_df"

  if (!glb_split_entity_newobs_datasets) {
    stop("Not implemented yet")
    # Sketch of the intended behaviour (was unreachable code after stop()):
    # glb_newobs_df <- glb_trnobs_df[sample(seq_len(nrow(glb_trnobs_df)),
    #                                       max(2, nrow(glb_trnobs_df) / 1000)), ]
  } else if (glb_split_newdata_method == "condition") {
    # Rows matching the condition become new observations; the training set
    # keeps the rows matching its negation. The new-observation subset must be
    # taken first, while glb_trnobs_df still holds every row.
    glb_newobs_df <- do.call("subset",
                             list(glb_trnobs_df,
                                  parse(text = glb_split_newdata_condition)))
    glb_trnobs_df <- do.call("subset",
                             list(glb_trnobs_df,
                                  parse(text = paste0("!(",
                                                      glb_split_newdata_condition,
                                                      ")"))))
  } else if (glb_split_newdata_method == "sample") {
    # library() fails immediately if caTools is missing; require() would only
    # return FALSE and let sample.split() fail later with a less clear error.
    library(caTools)
    set.seed(glb_split_sample.seed)
    split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
                          SplitRatio = (1 - glb_split_newdata_size_ratio))
    # Rows flagged FALSE by the split become the new observations.
    glb_newobs_df <- glb_trnobs_df[!split, ]
    glb_trnobs_df <- glb_trnobs_df[split, ]
  } else if (glb_split_newdata_method == "copy") {
    # Training and new observations are both the full data set.
    glb_trnobs_df <- glb_allobs_df
    comment(glb_trnobs_df) <- "glb_trnobs_df"
    glb_newobs_df <- glb_allobs_df
  } else {
    stop("glb_split_newdata_method should be %in% c('condition', 'sample', 'copy')")
  }

  # Applies to every split method, including "copy".
  comment(glb_newobs_df) <- "glb_newobs_df"
  myprint_df(glb_newobs_df)
  str(glb_newobs_df)

  if (glb_split_entity_newobs_datasets) {
    myprint_df(glb_trnobs_df)
    str(glb_trnobs_df)
  }
}
## dirty
## 547 NA
## 548 NA
## 549 NA
## 550 NA
## 551 NA
## 552 NA
## review
## 547 Let me start off by saying that this place is located on the corner of Dexter and Hayes.  This little fact has been a source of great frustration between me and my friends for years.  Anyhow, this is your typical neighborhood bar.  Good food, good drink and good people.  After you've been here a couple of times you are pretty much known by the bartenders and by many of the other patrons... and if you bring your favorite mixed cd you'll most likely get it played at some point in time if you hand it to the person behind the counter.Most of the other reviews for this place are pretty spot on... pool table is temperamental, dart boards are a little worn out and the outdoor patio is a pretty good size.  Not too mention you really wouldn't even make a special trip out here unless you were once or still are a regular.  It's one of those places where you're driving around in the area and you stop in because it seems cool enough for a quick drink till you meet up with other friends at a different place.  But, once you're in there you'll probably want to come back because it is just cool enough to hangout and have a good time. Fantastic after work joint.  Sometimes I get a little tired of it, as going to the same place every week can do, but inevitably every Friday booze calls and I go tromping up the fat hill to have a few (the hill is very fat.  Just FYI in case you want to climb it.)  Lovely little place, nice people (although the guy was a little mean to me once.  That is once, however, and I've been there way more than once) and they have a surprisingly yummy hummus plate.And they haven't kicked us out yet, despite a couple of broken chairs, numerous complaints from their neighbors and a fence scaling incident.  Gotta love the love from neighborhood bars. It's usually not my first choice, but I always have a good time at Dexter & Hayes.  
The pool table doesn't necessarily release all the balls every time, and one of the dart boards could use some help (bent wires and old cork can tend to "push" your darts out sometimes)... but for some reason this just lends to the overall "friend's basement" feel.  The staff is extremely attentive and you are never lacking for a drink. this is really a great little "community" pub. it really has that neighborhood sense and feel to it. its probably because most people would not (and i advise you not to) seek it out as a destination type spot. this isnt the kind of place that you would want to go out of your way to visit, its just not worth it for a simple beer. that being said, if you live in the westlake, northeast queen anne part of town this is a great after work spot. the happy hour deals for food (which is just your typical pub fare) and drinks are great and the atmosphere and the barstaff are really very friendly and outgoing. even though the place seems to be packed with "regulars" and people typically know each other from around the block, there is a very welcome attitude to everyone at this place. as far as decor goes, the bar has a very small front end with a couple televisions for sports (they play hockey!) and theres a nice garden patio out back and a small little rec room with a pool table and dart board down below. seriously, if youre in the neighborhood stop by during the late afternoons, its a very welcoming mix of both young and old. also, it gets pretty happening at night, especially on mondays when their resident piano man plays a set starting at 9pm. I came here for a friends birthday party.  They reserved they entire basement for a 200 dollar deposit.  So you can have the whole basement to yourself and if you don't burn the place down you get it for free.  This is awsome.  Anyway the drinks are good and I spent way too much so it must be a good bar.
## 548 Nice place.  Smallish, low key, comfortable decor, but also nice enough to dress up for if you like.  Not that we did...  If there's a wait, you can spend the time next door at the attached bar.  Interesting cocktails and fancy bar food while you wait.Once back in the restaurant, the menu of small, shareable plates covers a lot of ground.  Everything we tried was good, from the (not just good -- excellent) steak tartar to the rotisserie pork belly.  Service was not 100%, but was friendly and unobtrusive, and any errors were corrected in the right way.Two negative points, though:  One, what's the deal with Seattle cheese plates?  Every place I've had one, we're talking tiny little smidgens of cheese.  The markup must be astounding.  To Lark's credit, at least the cheeses were of nice style and quality, and not overripe (Hi again, Palace Kitchen).Two, the wine list was not impressive and doesn't match the value of the food (which is mostly reasonably priced).  We had two bottles of wine, both in the $30-40 range, and they were both disappointing to me (although they did put me in remarkably fine spirits!).  Granted, this is the "low end" for wine at many places, but it should be easy for Lark to find a few bottles in this range that would better match the quality of the food.  Given the $15 corkage fee, just about any $15 bottle of wine you'd care to bring will put you well ahead of the game. The best of Seattle's bird species named restaurants. Oh my god this place is great. Some many good things to try with the menu changing with seasonal items. I cannot wait to go back. After only two visits, Lark has become my favorite restaurant in Seattle for a nice night out, and without a doubt has the best food I've had since moving here.As an East Coast emigre, I admit to my confusion regarding "nice" restaurants in Seattle. I show up in my business-casual work clothes, and nobody's dressed better than I am: no jackets, no ties. People in t-shirts and polo shirts. 
I don't think it's wrong, but it makes me a little uneasy.Lark made it easier: the cozy rustic decor and extremely friendly staff brought about a very California sort of enforced casualness.I definitely found the menu complex to navigate the first time I arrived, but our waitress happily answered all of our questions without any disdain or frustration.And, oh yeah, the food? It's stellar. Small plates, meant to be shared, but don't get the cheese platter unless there are at least three or four of you; it gets in the way of trying other dishes.Start, perhaps, with the blood orange and endive salad. It's a visually stunning piece of work, with the deep red of the oranges on the outside of an endive pyramid. It's beautiful, and the sweetness of the oranges nicely balances the bitterness of the endive. The awesome presentation and complex flavors set the tone for the rest of Lark's food.Don't miss the rosti potatoes: a crispy, tasty potato pancake, like if your Jewish grandmother made latkes after studying nouvelle cuisine in Paris, or your morning short-order cook decided to make his hash browns a work of art.The fish and meat plates are spectacular: the pork rillettes are rich flavorful, the sea scallops are well-prepared, and the elk is great. The braised beef shortribs balance homey pot roast flavors with restaurant flair, managing to be both comforting and extravagant at the same time.Do save room for dessert: the frozen Meyer lemon parfait stands out as the choice to make. The other desserts fade from memory, but this sharp and fragrant concoction is unshakeable. Get at least one for your table, no matter how many other desserts you decide upon.All told, expect to order five or six small plates for two people, not including dessert. Add in a glass or two of wine each, and you're talking sixty-five to seventy-five dollars per person, after tax and tip.One last word: the menu is definitely carnivore-centric; dedicated vegetarians may be disappointed. 
Those tasty rosti potatoes are fried in duck fat. Some friends treated my wife and me to dinner at Lark and what a treat it was.  It was the first time all four of us had been there, so the waitress explained the portion sizes & family style presentation.  The warm, comfortable atmosphere is inviting, with exposed wooden rafters, and curtains hanging from a track that snakes around the room, allowing for the room to be open or to be sectioned off into private dining areas.We each ordered two items that looked good, and then they determined in what order to best serve our dishes, beginning with a selection of cheeses (we asked them to pick 3 that would match what we had ordered).  Sharing the choices of three other people, I expected there would be at least one dish I wouldn't be thrilled about, but everything was excellent!  The striped bass was buttery and had a nice firm texture, and I am not a big fan of fish. Fuji apple, raddichio and balsamic salad was sweet/savory delicious.  And the sausage & pork belly over white corn melted in my mouth.  They surprised us with a couple free desserts ("mistakes" from another table)... chocolate cake with coffee ice cream (again, not a fan of coffee, but this was so good I might have to reconsider) and apple tart with vanilla ice cream.Definitely treat yourself to an evening at Lark if you have the chance. Lark was by far my favorite meal in Seattle.  I love the intimate and sophisticated dining room, and the food was all absolutely delicious. i started with the  pork rillettes with fleur de sel butter and radishes, then the Corn Soup with poached duck egg and summer truffles, and finally squab with figs and a port sauce.  for dessert, i had a rhubarb cobbler.  My only dissapointment: i wish i could have tried much much more! next time i will have to bring someone with me. 
very good flavors, interesting and reliably tasty combonations of things like: Duck eggs,  wild mushrooms, olives, radicchio, prosciutto and manilla clams. Just ask your server for clarification of things foreign and otherwise unusual: (the following words straight from the menu) MOZZARELLA BURRATA, VENTRECHE, CLABBER CREAM, FARRO, SULTANA, LANDJAGAR, CARPACCIO, SABA, CORNICHONS, SWEEDBREADS, GNUDI, BLANQUETTE, and EMMER. Lark serves foods from local "sustainable" artisan farms, bakeries,  and butchers.  Everything is served in it's own, specially designated plate or vessel, often bare cast-iron (STAUB brand) dutch ovens.  The dessert is mighty fine and very welcome after a salty meal. Try the Lemon parfait.European to the core, (from the menu to the "W.C." on the bathroom door to the big bill at the end) lark pleases your small group or a hot date. Bring the airline miles visa card.
## 549 Everything I could want in a neighborhood pizza place. Solid east-coast-style pizzas, cold beer, and full tilt ice cream! Plus, video games, a kids play area (If you are a childophobe, you probably should get takeout, it's VERY kid-friendly inside)The Aloha pizza is your standard Hawaiian plus sun-dried tomatoes. It sounds a little weird, but they aren't super dried, and add a nice dimension.The Pritty Boys' Special is something out of my childhood. Pepperoni, Sausage, Mushrooms, and Black Olives. Yum!Great sauce, great flavor to the crust, though the oven could be a little hotter and crisp up the edges a bit more.I didn't have any problem with the service like other reviewers mentioned. Sure it was a little slow, as they were very busy, but everyone was super-nice and friendly. I really like the pizza but the service needs improvement. The food: The pizza has great sauce and the crust has a great combination of crunch and chew. Very nice. We have had a couple different salads and they have been quite good. Definitely much better than your typical pizza place fare.The service: Have been three times and I don't think I have seen a smile on the staff yet. I know the place is new so I give them a little slack, but it has definitely been disappointing. I also know they get busy but smiles and friendly attitudes can't take that much time can they?  A good service contrast in the neighborhood would be St. Clouds, been there many times and have always felt greeted warmly and served happily. It makes a big difference.So I will return but it might end up being more take out than eat in unless the service improves, which I am hopeful it will. I would love for this place to be a regular spot for us. Take out pizza tonight from our first neighborhood pizzeria and we're fans! Crispy crust and a bit of chew...I can honestly say as a New Jerseyite, this is as close to east coast pizza as I've discovered in Seattle.  We are so excited.  
Mama's Pizza on Capital Hill is a close second for us.  The woman who took the order and provided the service was a bit flat, hoping she gets a bit more excited about the great pizza they are serving.  Seems like a super family friendly space.  The website is not yet up and running, but will likely up soon. Have some Molly Moons afterwards at the new MM spot a 1/2 block up.  Check it out! The pizza is pretty good, and there's a kid's play area, and those are the only two positives in this place.  The service is terrible however, as it seems like the waiter had never waited on anyone before - he took our drink and food order, came back with our drinks and then never showed up again.  We had to scour the place to find him to ask him for the bill. Ok they have some kinks to iron out, but they have some key things going for them that already makes them a way more pleasant place than the alehouse down the street (a place that should be awesome but isn't). The pizza is good, but the most pleasant surprise is that the salads are excellent. The price point is good for the pizza and the salads. The pasta is good but seems a little overpriced. The servers are very sweet and are clearly trying to do a good job which makes the place have a nice vibe even if they aren't always on top of it. They really doubled down on the kids area both in the main room for little kids and in the back where there is an xbox, wii and pinball machine. Its pretty cool but the downside is that it makes the layout of the tables kind of wonky. Maybe they should have consulted a feng shui expert or something, the bar doesnt quite feel like a bar yet with the icecream case on one side and boxes of bottled beer on the other. These things are all pretty minor and definitely fixable over time. If they keep working at it I can see this fitting into Madrona very nicely.
## 550 Mediocre food and decent service. Why don't they get a better restaurant in place of this place? I title this "Holy s&*, seriously, how dare you not comp the ENTIRE bill". Dedicated to Ella C.  Several years ago, in 2005 BY (BEFORE YELP), my sis& I drowned our post-visting our father / pre-morning coffee sorrows in a delicious mound of heavenly french toast stuffed I MEAN STUFFED french toast, oozing I MEAN OOZING w/ blueberry & ricotta cheese stuffing.  In fact, before my last visit to Julia's I had their stuffed french toast listed as my YELP WHAT IS YOUR LAST MEAL meal.  Well, I changed that listing toot sweet after last weekend's fateful experience ...Picture it.  Sunday morning.  A highly anticipated re-creation of the Sunday morning years ago where sis & I shared heavenly pillows of frenchy toasty goodness.  We even found a parking spot RIGHT in front!  Good start, right??  Oh how wrong wrong wrong we were.  There were 4 of us, so we decided to order the stuffed french toast as an appetizer if you will, for us to share.  Mind you, my sis &I have not only talked this french toast up to our breakfast companions Ella C. & Mom of Winston, but we have elevated its reputation to that of a cancer curer, a million dollar lottery winning, no traffic on the 405 EITHER DIRECTION at 5 o'clock (that's like the LA version of the Bay Bridge, for you 415 or 510-er's, or the Floating Bridge if you live in the 206).  We ask for the pre-requisite cups o' coffee (here I digress, how SHITTY is drip coffee in Seattle restaurants when most establishments make a latte that brings me to a euphoric state of nirvana - hey, Nirvana, Seattle, get it?!?!), &place our accompanying egg-based orders.  Here's where our story takes a gastronomical turn for the worse.  My sister & Ella C ask for soy sausage in their breakfast burritos.  I ask for the regular breakfast burritio, Winston's mom asks for a delightful sounding yogurt granola parfait & a side of bacon.  
The waiter asks her how many slices-2, or 4.  Pay attention to her answer, it will reveal it's relevance later.  She answers 4-4 slices of bacon.  We order our french toast appetizer & our 'waiter' scurries off.  In what seems like HOURS later, our "waiter" (god, I just lol'd typing the word 'waiter' to refer to the EFFING JACKASS who somehow hijacked a real waiters uniform & played a cruel joke on us by imitating someone whose job it is to listen to us ask for food, then write down what we say, then walk it to a kitchen prep area) delivered plates of food to our table, not necessarily our food, but to be fair plates had food on them.  Exhibit A:  He delivered plate of the cinammon french toast, not the stuffed french toast as ordered.  Exhibit B:  After placing orders in front of us, he busted out his best waiter karaoke (thank you Bill Simmons) "did you need anything else right now"?  Um, our stuffed french toast & the side of bacon.  "How many slices?" he asked, "Um, 4, like I said".  He blinked & walked away.  Minga B took a bite of her burrito first (even though she & Ella B ordered same thing, Ella C is STRICT vegetarian & Minga B is SO NOT so Minga B is always the test-biter-into-er.  Minga B took a forkful of burrito & put it in her mouth.  Her face twisted into some sort of expression reserved for the ugliest of scary gargoyles - give me back my sister!!! - & said "oh my god, these eggs taste REALLY funny"... She took another bite to see if she could figure out why.  We all stopped to look at her as her confusion turned to visible revulsion at the answer provided by her taste buds:  "Oh my god you guys, these eggs have f&*%ing SOY SAUCE on them!!!".  THE LITERAL DUMB WAITER HEARD SOY SAUCE NOT SOY SAUSAGE!!  As my jaw drops in disbelief, I manage to close it around my forkful of my regular breakfast burrito.  WTF?!?  What's the funny taste I am now experiencing??  Well, it's PINEAPPLE!!!  
FUNNY, I DON'T REMEMBER BREAKFAST BURRITOS COMING WITH PINEAPPLE EVER IN MY LIFE OF EATING EGGS WRAPPED IN TORTILLAS WHICH IS A LOT CONSIDERING I'M HALF MEXICAN!  As if soy sauce eggs weren't enough, there was pineapple in my burrito.  Winston's mom still hadn't received her 4 f&*%ing slices of hog back.  When the waiter finally returns to deliver the stuffed french toast & my friend's bacon, my sister looks at him, trying to be kind she says "um, I think you heard SOY SAUCE instead of SOY SAUSAUGE", he says "um, yeah, I thought that sounded strange".  "Why didn't you ask us then?" she said.  Blink.  He returned to kitchen to have poor chef re-make burritos leaving us w/ the most ANEMIC stuffed french toast i've ever seen.  Even the toast was embarrassed to call itself French wearing it's flimsy bluberry outfit.  We all stared at it & I could feel my friends thinking "those broads crapped themselves over this sh&^?"  We quietly picked at the plate waiting for replacement burritos to arrive.  Once they did, sis & Ella b didn't even care. We debated over tip or no tip, the waiter didn't believe my pineapple claim, & I learned a valuable lesson: HEED THE YELPS FOR THEY DO NOT LIE! the service is pretty bad. the two times we went there, it took forever to get our orders and our food...and they always messed up atleast two things.and once, i was there with my boyfriend and the waiter came up to us and was all "how can i help you ladies today?" and my boyfriend turned around and was like "uhh..." and the waiter got all awkward.but that was just funny. This place is notorious for terrible service(if you get any at all) I tried this place a couple of times and every time I was flabbergasted at the lack of attention to the clientele. Even more I was amazed at the lack of a PERSON there to not be paying attention to us. The firs time I went there we stood in the doorway for over 10 minutes before anyone even came to seat us. 
15 minutes later we were seated at a table that pretty much had a big velvet curtain resting ON it, I felt like we kids again hiding in the curtains. Someone asked us about drinks, we ordered and then waited...and waited....and waited. The person who took our drink order wasn't even the server. We never saw them again and consequently (seriously) never got a server to our table. Naturally we got up and left hungry and frustrated and fairly amazed that it was THAT bad. The second time we decided to see if they had figured it out yet, we again stood in the doorway and waited to be seated...and waited...and waited...and waited. No one EVER came to seat us! I thought 'oh my lord'.  That was it for me, since then I have consistently heard of their terrible service when service is even available. So this place is on my official blacklist. I'm a huge fan of their seafood salad.  I was a little sad though that they quit putting fried calamari on it, but it still tastes yummy.  I think other food items are fine, nothing so spectacular to me.Service can vary depending on the server.  And do not expect to get a wonderful service on Sunday morning-afternoon.  There's gonna be a long wait no matter how much you bitch about it.  My favorite breakfast is the Chicken fried chicken (fried steak) ... hmmm tasty.Also, their drag show is a must-see.  It was way more exciting and well  constructed than I had expected.  $10 per ticket, but get their early to get a good seat! A great place to break in my new Prime card with an old friend. Service was fine, and we didn't mind any small waits. I had the chicken gyro which was very good, my friend had a salad which, frankly, I didn't even look at we were so busy catching up. Lunch for two after Prime card + tip (on amount of original bill, of course) = $18. Gotta love it! Not a place I'd ever go back to.  Reasons:Waited at least ten minutes for a table.  
The one person on duty at that point seemed overwhelmed by her three tables (not counting us) and just plain annoyed to see us.When we were finally instructed to take whatever table we wanted (the place was dead...  why couldn't she have said that 10 minutes earlier?), we didn't get water or menus ... or coffee for another ridiculously long time.  I was nearly driven to nibbling the dried out pepperoncini stem I found on our table (um, ick)!!!  Not really...  I was feeling more inclined to leave, but we foolishly kept hoping we'd catch the attention of a server...  finally... it happened!!  A different server who was nice but couldn't seem to remember our orders.My brother ordered the french toast.  The bread was thick...  and wasn't given the opportunity to soak up any custardy goodness...  so his breakfast was disappointingly dry.  Booo!!I had the basil and cream cheese scramble.  Not bad actually.Since I am forced to give at least one star...  We'll just say it's for the third server who handed us our bill.  She brought it to us immediately after we requested it, and made my brother laugh by calling him gumdrop.  :-) If you have yet to experience the displeasure of biting into eggz scrambled with soy sauce, I sincerely hope you are forever spared the sensation.  (Please refer to Ramona R's brilliant review). I had read the reviews about the service here, but I couldn't believe that it could happen to me. I am pretty positive that the servers are told specifically to seem to not care about the guests, as our waitress was almost comically horrible at her job and condescending. It was like we went to a theme restaurant where the theme was forgetting what you ordered and being weirdly bitchy to everyone. The restaurant wasn't very packed but it took a while to get service. As far as the menu goes, they have the same pub fare every single damn restaurant in Seattle has. A list of the same burgers (holy crap, you have a mushroom burger?! 
I HAVE NEVER SEEN THAT), "American" stuff and salad. I guess if you subsist entirely on Mac N Jacks and bacon burgers this place would be awesome for you. I decided to shake things up and get their meatloaf, which was not entirely horrid. It had been sitting under the heat lamp for a bit too long so that the outside of the mashed potatoes had a crust on them, which lent more credence to my theory of them being bad on purpose. The loaf itself was waaaaaay too garlicky and it was served with mixed vegetables. Honestly. Why do restaurants STILL serve mixed vegetables? I can't think of one person who thinks that they are a totally awesome thing to eat. Oh, can I please have more plain steamed carrots and broccoli?! THAT WAS SO BLANDLY DELICIOUS YAY.I went there for dinner, but I hear that their brunch and breakfasts are better. For a place so popular, it really surprised me how lackluster it was. Meh. Julias is a great breakfast option on Broadway. There's a pretty big vegetarian menu and you can substitute tofu for eggs in a number of 'rancheros' type breakfasts for a fee. I usually get the "tofu breakfast" which I enjoy, whereas a lot of tofu stirfry/scramble plates are mediocre I like this one.The interior is uniquely decorticated, another opportunity to stare at something new. I don't know about the shows they have here, but the stage looks cool with the old heavy theater curtains. I've been here a few times with friends congregating from all over and the servers have been attentive enough to notice that we had new people throughout the meal and kept coming back with water and coffee refills through the meal.  I can see it being a little lax at times, but it's a laid back breakfast show up my book, and whenever I go here I'm in no hurry. I've been going here off and on for years. This is my favorite breakfast spot, and occasionally just a good place to drop in and have a bloody mary. 
Maybe I'm just a creature of habit, but the atmosphere here is fun and the staff is completely adorable. I love it. P.S. Stop groaning about having to wait during the the morning or lunchtime rush. I mean seriously, welcome to eating on Cap Hill. If you want fast food, go to Jack in the Box.
## 551 Oh. My. Gosh. What an amazing sandwich. Top-notch ingredients really go a long way at Westlake Specialty, Boar's Head meats and cheese make these sandwiches, surely. I just discovered this little gem the other day on my lunch break, just a few blocks away! Hooray! Granted, I can't always drop that kind of cash on a sandwich, my Reuben was around $8, and worth every penny. It's actually best Reuben I've ever had, not that I'm any expert on the subject. The coleslaw was crisp, fresh, and provided the perfect crunch, graciously piled on a heaping serving of warm Boar's Head pastrami. Is your mouth watering yet?I thought that I was going to be receiving a pre-fab sandwich like the ones sitting in the display case, all of which looked equally amazing, but upon my order, I was pleased to wait just a few minutes to have my Reuben freshly-made and grilled on the panini press. What a FABULOUS sandwich!!!Not only does Westlake Specialty have a great variety of sandwiches (and, of course, if you don't find exactly what you're looking for, you can custom order your own), but a wide variety of groceries, fine wine, cheeses, all sorts of snacks and sodas. This is pretty much the ideal convenience store for those ready to spend a little extra cash.Try Westlake Specialty! You won't be disappointed!!!Thanks for listening! Great sandwich with Boar's Head meats and cheese.   Clean market.  Small countertop seating.  Perfect for a quick, healthy lunch.
## 552 Decent food but a dirty establishment.  This place is always crowded during the lunch hour.  Imagine eating inside of a packed King County Metro bus and you have an idea of what a busy lunch is like here.If you are a hardcore germaphobe you probably wouldn't like this place.I made the mistake of sitting outside one day when it was nice and was harassed by every homeless person that walked by and asked for everything from my drink to a handful of french fries to $20. Got my 3 Q's! Quick, Quality, and Quantity! Delicious! Any choice on their large menu is a good choice! Just make sure you bring a bib because it can get messy!Must Try: Beef Gyro and the Balaclava  DELICIOUS and total less than $10! Open late! PS great Cream Cheese Hot Dogs on those cold, drunken nights (stands out at 7:30p!) This place looked good from the outside, but I was pretty disappointed with the food.  The gyro meat came in big, dry chunks - not very tasty.  Their spanakopita filled with almost all onion with only a little bit of spinach - I couldn't eat more than two bites. The best falafel ever, and cheap! I love this place, working downtown it's hard to find a decent, affordable eating establishment since everwhere you looks it's nothing but upscale, snooty booty cafes and wine bars. Nothing beats going here before or after a M's game if you starved yourself enough and avoided the $10 garlic fries. Seen a few concerts at qwest field event center and descended on this place in the later hours. Sitting outside on a warm summer morning around 1 am with tzatziki sauce dripping on your shirt, nothing comes close. I love the wall of sport's memobilia they have, although seeing Gary Payton wearing a Sonics jersey always gets me teared up, R.I.P. supes. Cream cheese kosher hot dogs! Can't get any better. The guy who usually makes them is hilarious and will keep you entertained while you wait for your late night fix. 
Its great because they are usually there until 1 or 2 in the morning if not later. Havent really tried anything else on the menu because I am hooked on the hot dogs. Always one of my favorite places to go for a quick bite to eat. Will I go back?.....YES. Should you go?.....Only if you are not going to get in the way of me chomping on my meal!
## dirty
## 553 NA
## 576 NA
## 648 NA
## 690 NA
## 691 NA
## 742 NA
## review
## 553 Wow...I can't believe I've lived in Seattle 16  years and just found out about this place. The Bahn Mi pork sandwich came highly recommended and did not disappoint. The fact that it costs only $2.50 is a huge bonus. We will add this to our regular rotation!Oh...they have parking next to the deli as well, which is super convenient. I've had a bahn mi (pork) at just about every vietnamese deli in the ID because I love them THAT much and I want to make sure I'm getting the best one.  Saigon Deli is definitely the best for sandwiches.  I will come all the way from Wallingford, ON THE BUS, for this sandwich, this is how much I love it.  I usually get the spring rolls with the spicy peanut sauce too.  I've always wanted to try other stuff, but I can never bring myself to order anything else because I love the sandwich so so much.  We usually eat in the car because there is no seating, but the curb outside is nice too! As everybody else has said, Saigon is the shizzy!  The bahn mi are to die for.  I love the roast and shredded pork.  Perfect balance of flavor, crusty bread and spice make this one of my favorite sandwiches in the world.  They can get very hectic around the lunch hour, so come early if you can, I promise you wont be disappointed. Saigon Deli is a great place if you're looking for cheap eats including banh mi or Vietnamese style sandwiches and snacks such as che or Vietnamese sweet desserts.  The banh mi is VERY cheap, at around $2/each, and it only takes 1 or 2 to really fill you up.  Aesthetically, Saigon Deli is a hole in the wall but it serves its purpose nonetheless.    Saigon Deli doesn't have a place to seat either inside or outside so you'll have bring your lunch elsewhere in the ID. I am all about eating good healthy, clean food.  
I've been coming here for a long time and have been loving their sandwiches and gave high reviews to my friends about it..until recently...One time I walked in to order sandwiches and everything was great...but if you notice...the ladies that make your sandwiches...they wear rubber gloves when they make it..but they also use the same gloves when they take your money and give you change....and sometimes they use that same glove to make another person's sandwich....gross...Happened twice in a row.....that's just nasty.  Besides that...I love their food...but next time Im going to ask them to reuse the gloves...so I give it a 3 star. Quick, easy, cheap, & flavorful grilled meat (had the pork sandwich). Actually I lied, I had two.In and out, bam! bam!... Total cost = $5.00.Seattle, if I ever see you again, come find me in Saigon Deli I heart Saigon Deli. Best banh mi in Seattle, hands down. And so ridiculously cheap. We always get the pork and house special, and can't figure out which one we love more. I'm getting hungry just thinking about it. As good as cheap ($2.50) sandwiches can get.  Sure you can get a better sandwich, but you would pay 4 or 5 times more.  This place is fantastic, they have a much better ordering system than other Vietnamese sandwich shops, the help may not speak a lot of English but they do know how to make a fantastic sandwich.Co-workers have raved about how good the other food is, but every time I go I have to order a sandwich.  Going at lunch time the store will be quite full of people waiting, typically though the wait for a sandwich is less than 5 minutes.  How they make the food so cheap, is beyond me, or I just don't want ot know. AWESOME sandwichesAWESOME spring rollsCRAPPY that they close at 7 AND they run out of spleeng lolls. I was surprised that they close at 6 now...the economy must really suck. Good thing I was able to get there 15 minutes before closing to get some tasty sandwiches. 
You must get the grilled pork sandwich....or maybe two. It's so freakin good. I'm always dumbfounded that sandwiches this cheap could taste so good...every bite was just full of flavor. You can buy like 4-5 sandwiches for about $10 max. Next time I'm in Chinatown this will be my official place to stop by for some tasty banh mi. Be sure to get there before they close! I remember my parents coming here to get banh mi when I was a kid and I recently rediscovered this place as an adult.  Probably Seattle's best banh mi period.  I'll easily get 5-10 sandwiches at a time and have lunch for a whole week!  Just heat em' up in a toaster oven.  MMMMM!!!  I think I'll go there again right now!  There's no place to sit so come here and grab some for a picnic or take it back to your place so you can eat it in the privacy of your own home, in your underwear!  Hahahahaha! In search of portable chow for the M's game? There is no cheaper/ tastier game fare than a couple bahn mi from Saigon DeliOrder up a couple of these bad boys and toss 'em in your bag/ back pack and head into Safeco. I'm telling you- your neighbors in the surrounding seats will be envious and wish they were as smart as you.I prefer the beef, but the tofu is just as tasty. And if you're really feeling indulgent, snag some of their fresh rolls for a little over a dollar for two. Yes, you really can get 3 or 4 delicious sandwiches and a couple of spring rolls here for under $10...the same price you'll pay for a stale brick o' fries. Like I said, Saigon Deli is where it's at. Your wallet and your waistline will thank you. Saigon Deli in the I-District on South Jackson. Yes, I needed to be that descript with which establishment I am reviewing. There is a Saigon Deli in the Uwajimaya food court and in the U-District and Saigon Vietnam Deli a block away. When I first was looking for this Saigon Deli, I had mistaken Saigon Vietnam for it which makes me appreciate the Yelp picture feature very much. 
It helps me figure if I am at the right restaurant or not.Having figured out and found the right location for Saigon a while back, I now go to Saigon as my main banh mi supplier. Prices are pretty much in line with neighboring shops in that they are cheap. Particularly like their three ham sandwich (#2) and the shrimp spring rolls with peanut sauce. The service sometimes leaves a little to be desired, but overall Saigon Deli in the I-District on South Jackson is a good cheap and fast place to go and get something tasty to eat. I think Subway has met its new competitor for cheap, healthy sandwiches. Saigon Deli incorporates fresh ingredients and charges no more than $2.50 for even the meatiest option.I had the roasted pork sandwich (#1) and the mixed ham and pork sandwich (#3). The O.G. roasted pork is definitely better. Perfect flaky bread, slices of roasted pork, and crunchy marinated carrots, cucumbers, etc. Their spring rolls can rival any restaurant and are huge for only 2 bucks!I wonder if Saigon Deli has ever thought about franchising... I wasn't sure @ first I got the right  Saigon Deli; I suck @ directions & there is apparently a ton of Saigon Deli's in a 2 block radius?!?.  Anyway, I got 2 gigantic sandwiches...figure I'll save one for later.  As far as price...can't beat a huge grilled pork sandwich for $2.50 or $2.75...whatever, it's cheap.  The bread was crusty & delicious, ingredients tasted fresh and the sandwiches have a little kick in them (they add jalepenos).  The chicken sandwich was less tasty but still okay.  I give this place 3 stars b/c being cheap isn't exactly a rare thing when it comes to bahn mi sandwiches.  The sandwiches are solid, but I've had better.  And I get that Asian places aren't the cleanest and personally I wasn't that bothered by the fact that I saw pre-packaged spring rolls neatly stacked on the table near the counter...I wouldn't buy anything that wasn't refrigerated and had shrimp in it, but that's not my problem.  
But mostly can't give it 4 stars b/c I'm a pig and hate having to get back in my car to then drive somewhere else to get pho.  All in all, one of many places you can grab a good lunch. I love this place. I get spring rolls, bahn mi, and various jelly desserts or small cakes- for cheap cheap cheap! The flavours are quite tasty and the price is a hit. It is a mandated stop for me while I am near or on Jackson. Let me start off by saying I have never stepped foot into Saigon Deli so I can not comment on the atmoshpere, the service, etc. I do, however, take an occasional art class at Pratt Fine Art center down the street and Saigon Deli is the #1 food pitstop for myriad starving artists.As result, I have snarfed down their pork bahn mi-s on numerous occasions. There's always someone in my class who makes a Saigon Deli run for sandwiches and spring rolls and just about everyone puts in an order.What amazes me, after 5+ years of attending Pratt, is in that time period the price has remained low and the quality has remained consistent. I mean, how do they do it? At about two smackeroos per sandwich, the price/quality/ taste ratio is out of this world! Unbelievably so. I feel like I'm getting away with something.What you get is about a 9 inch long piece of French bread that's not too soft, but not too crusty either. Your lap isn't showered with crust dust when you take a bite. Inside is some mayo that I believe is slightly flavored with lime or lemon, a little bit of a pate spread of some sort, a hearty layer of tender and mouthwatering pork and then layers of carrots, cool crunchy cucumbers, white onions, cilantro, and jalapeno peppers. With each bite there's insane layers of flavor. Seriously, it's like flavor fireworks going off in your mouth. It so fresh and full of mouthwatering flavor that I  can not express how delicious it is. This sandwich is to die for. 
I've tried other bahn mi sandwiches at other places but hands down this IS the best place in Seattle to get bahn mi. My only beef is I don't live close enough to Saigon Deli to go on my own. Otherwise, I easlily would be there at least two to three times a week if not more.If you're ever in this side of town and are looking for cheap, quick, good eats I highly suggest stopping by. I haven't tried any of their other menu items so I can't speak for those but if they're anything like the pork bahn mi then you won't be disappointed. Wonderful, flavorful, inexpensive food here. I picked up 2 Bahn mi BBQ'd pork sandwiches, bitter melon soup, rice and pork dinner, 2 French coffees and a hombao for $13! How great is that. The sandwich bread is fresh and crunchy and if you pay cash, you get a better price. I love it! Thanks Seattle Yelpers, for leading this visitor to the Bahn Mi at Saigon Deli. After a hard truck up Jackson against a cold March wind, I was ravenously hungry.  Oops...I discovered that it really IS just a deli, there are no tables or chairs. After ordering my barbecue pork Bahn Mi, I asked the owner if there were any nearby parks, since I wanted to sit down and eat.  He insisted that I stay and sit on a plastic drum behind the counter.  This afforded me a fascinating view of the kitchen, which had two huge covered cooking pots sitting on an flaming brazier. The sandwich was a chewy, fresh, mouthwatering blend of savory (pork), hot (jalepeno) and cool (mayonnaise, cucumber and cilantro). Wish I could have a dozen shipped to Chicago right now. Killer Banh Mi Thit!  This is the only place in the city I buy em.    I like the shu mai personally and the classic thit nuong.  Lines are getting long but they are really fast at making them.  Good strong Cafe Sua Da (Vietnamese coffee w/ condensed milk) and also Che (desserts).  Great for people on a budget and wanting a really good filling sandwich.  Nice job guys and can u somehow add some tables please! :)  hahah...
## 576 Oh Kate's how I love thee.This bar is fabulous. From the time we moved into Wallingford, until the last second before we moved out of state this was our go to bar. And unfortunately, not many bars measure up to them.Happy hour is great! Tasty bar food and it is all half price! Plus they have drink specials too! Lots of good "certain day of the week" deals, fun events (Trivia with Billy is SWEET) and great bartenders. Free pool and darts. Lots of seating!Seriously, I miss you Kate's Pub. You are a great, great bar. Can't complain about a thing here.  Half price food for happy hour can't be beat! Service/Bartenders are the best in town and if you are looking for karaoke, look no further than Kate's.  Best holiday parties, great beer selection on tap, food is awesome, I could go on and on.  To be fair I'll try to come up with a criticism...it would be nice if the bathrooms were bigger. I love this place! ...has to have one of the best happy hour's ever. 1/2 off EVERYTHING (food-wise) on their menu every day 4-7 pm. seriously awesome. and the food is good! *macaroni and cheese (cheesy deliciousness) *hummus plate: housemade hummus was surprisingly good, served with warm and soft pita triangles, carrot and celery sticks, roasted red peppers, tomato slices...and maybe something else that i've plum forgottenon a previous visit about a month ago the nachos here were pretty good: there was enough cheese down to the last chip (which is an unfortunate nacho rarity!) and the vegetarian version was full of black beans, salsa, sour cream, and jalapenos, however the cheese wasn't all the way melted and they were only lukewarm; that was a bummer. eric and eric were the bartender-servers the last two times i was in and they are very good: super attentive and friendly. love all the pictures of "kates" on the wall. 
obviously a lot of regulars here (alot of UW-ers) and it's easy to see why - it's a very comfortable, laid back, fun-and-friendly-people-type place with good food, great food and drink specials, and events. front room was a damn fine place to watch the apple cup this year, too, and could have only been better had the COUGARS won :) Decent beer selection at decent prices and a bar with plenty of seating in a surprisingly large space. Plus, free darts and free pool (lumpy pool table, but still, free). Couldn't be better right? But wait! The constant smell of greasy food being cooked! Gotta admit, I haven't tried any of it, but it sure smells good and doesn't look bad either. I like the vibe of this place, especially at off hours and on weeknights. It can get crowded during prime time, so be warned. If you are here on a weeknight, the bartenders will roam around to check on you and likely remember your name if you open a tab. Not to shabby.My one problem: The Juke Box explicitly states "No Journey," and yet Journey was playing while I was there. Maybe I missed the joke, but I would have been happier if it wasn't a joke!Make sure you get pitchers for the table or stick with the cheap well drinks. You'll save a lot over pints and fancy drinks. Of course, this advice works for just about any bar... All the same, good place. Two things you need to keep in mind when visiting Kates... 1) If the next day is a day off classes for the U, DO NOT COME HERE! There will be about 10 thousand frat boys and their twig like girlfriends there. 2) they have AWESOME karaoke, great song list! The DJ is great and super nice, the list can get pretty long but he'll let you know as soon as he can if he thinks you can't make it before the end of the night.Beer, cider and liquor make for a pretty good bar (not to mention one or two OUTRAGEOUSLY cute bartender boys)... 
I try to be DD when I come here so I never drink much if at all and they usually will comp me a coke if they realize I'm the lady behind the wheel that night. I've gotten pretty harassed by a few of their customers (ie drunk frat boys trying to pick up on this hot mama) and it's gone pretty unheaded but seriously the guys that work here are too busy to pick up on everything so just be fair warned when you go that this place can quickly turn into a meat market without you even noticing it and before long some drunk guy is practically shaking you he's leaning on you so hard slurring out that he loves your glasses. A-M-A-Z-I-N-GAnd what is more comforting than well-made pub fare? When it is half-priced, of course! Their hh is daily, but even with nornal prices, it is very affordable. The staff is friendly, attentive, and pour a mean pint. Wed night is Karaoke, and  with a crowded house, get there early to grab a seat. Dear Kate's,I love you. You are my favorite bar in all of the land. That said, it makes me sad that I am so far away from you now and can't enjoy absurdly strong Jack & Diet's and karaoke every Wednesday. Your happy hour is the best deal ever and your meatloaf sandwich is amazing. I can't wait until I am living in Seattle again and can once more frequent this lovely place.Love,Calana. my love for kate's has grown exponentially. Half price food during happy hour, which is every day for a generous amount of time. A staff who remembers your face, if not your name. And generally just good people, serving good beer, and big pours of shots. Pro-tip: Saturday is $3 Jameson night.
## 648 This was the first place I ever had papusas.  Well, you know what they say, you never forget your first.  Of all the places I visited, by far, my favorite papusa place.It can get crowded here, but forge on!  I usually get a veggie papusa and a meat one of sorts.  They are both equally good.  I've never had one I didn't like. Great food, great prices, great service. The pulpulsas were hot, fresh, and made to order. The combo platters can't be beat for $7.50. Excellent hole-in-the-wall place I'll be back to again. I like their mahi mahi taco a lot - big chunks of lightly fried fish topped with refreshing salsa and in doubled wrap corn tortillas.  However, I didn't really like the pupusa - I felt that the tortilla lacked texture and the fillings were pretty bland.  I'm not a pupusa expert though, and it's likely that I just don't like pupusas in general.I'll be returning for the tacos though! I don't come here very often because it's a long walk, but every time I come here I have forgotten how absolutely delicious it is. The pupusas are the PERFECT blend of soft, crunchy, creamy, cheesy, sour, warm, meaty, and cool. Nothing like it in u-district. We intended to go to a nearby restaurant but it was closed. Decided to give this a try because I'd never had Salvadoran food before. First impression upon walking in: you can smell oil frying, but it smells extremely fresh and reassuring, the people who work here are friendly and efficient. All the food was wonderful and interesting though I didn't care for the cole-slaw type salad, it had a lot of thyme in it and was very spicy. Good food, love the Papusas so so good have to try it! The 1st couple of reviews I read here pretty much reflect my sentiments.  This is definitely a "hole-in-the-wall" with order-at-the-counter" service.  It is cozy, however, and the food exceeded my expectations, which were fairly high.  I had the sampler with one taco, one papusa plus rice and beans.  
There was also a tasty condiment (salad?) on the side which reminded me of both coleslaw and sour kraut.   The meal was filling, and not too greasy. Veggie Burrito was satisfying. Didn't rock my burrito world. Waiter was nice. Great vegetarian menu; friendly staff, fantastic food. My mouth is still burned and my stomach is still full of pupusas! Such a fantastic eating experience...so many flavors and excellent variety. I ordered vegetarian combo #2 (pupusa, fried yuca, slaw, and empanada). A-MAZING - every single thing. The yuca was the least flavorful, but when dipped in the red or green (or both!) salsas, it came it life. My fiance had the a chicken pupusa and pork tamale and was equally happy. Excellent value for the price, not busy, ample (pay) parking. Here is where I confess that I'm a CA transplant who really misses  the incredible abundance and variety of cheap, delicious ethnic food in San Francisco and Oakland.  Please don't hate me.  The point is, Guanaco's makes me a little less sad for the food I left behind.  Their pupusas are delicious and come in both rice flour and masa versions.  I always ask for extra curtido,  a tangy/spicy cabbage slaw that they make daily with home-made pineapple vinegar.    Their fried plantains are yummy too! Food was flavorful , homemade hot-sauces were lacking in flavor (which was very disappointing) , portion was mediocre at best. I have no idea what justifies the price. Probably will not return.
## 690 I'm partial to hippie-food. No wait, scratch that. True hippie food is squash and wild rice with cumin on it served in a drafty house with crappy plumming...in Oregon. I'm partial to food that is hippie-esque, like whole grains, taters with the skins on, whole rainbows of spices in one dish, sprouted stuff, and things that taste organic even if they're not. So that makes Sound View the place for me. Their huevos ranchers estan muy bueno - sabes? Only downside - if you can call it that - is that their portions are too big. I'm a pretty big guy with a pretty big appetite. I eat entire watermelons by myself. But I still have to split whatever I'm getting at Sound View with a friend.
## 691 This food is amazing. I decided t ogive this place a try after newly moving within their delivery zone. The food was delievered promptly... although I thought it strange the delivery guy didn't even offer back change when I over paid him (not that I needed it). I was extremley impressed with the food. The individual meal I ordered at $10 easily fed the two of us. The Mongolian Beef was seasoned perfectly, and spiced as ordered. It was mighty tasty! The Sweet and Sour Pork was slightly rubbery, but the sauce and huge chunks of pineapple made up for it. The Hot and Sour Soup is A+++ and veggie eggrolls were good, too!I am definatley ordering again! I think chef at wok is the best Chinese food places in town.  If you're in their area give them a try.  The portions are fair/large, and the food tastes fresh.
## 742 I have a bad habit. I often go long periods throughout the day without eating, just so at the climax of hunger I can spend all my calories... in a huge blaze of glory!!!!!!!Belle Epicurean is a great place to do this.When I'm downtown shopping literally until I'm about to drop, there's nothing like getting a revival in the form of flawless pastry and strong coffee. Yesterday I was in desperate need and found solace in a rich cappuccino and, more importantly, brioche with honeyed pears, toasted almond slivers, and rum-soaked raisins. I'm sorry, WHAT?! How dare they. Plus the chef was trained at Le Cordon Bleu and has all sorts of rave reviews on the wall along with her diploma. Impressed? Because I am. I sat there and devoured my 700 calorie snack (served on delicate, beautiful china, obviously they don't know my clumsy ass because they should not be trusting me) in front of a huge mirror. Front-row seats to my gluttony, awesome.Hint: It's only open til 2 on weekends, also it's tiny and easy to miss. So keep your eyes out for the statue of the fat French chef in the entry (or, the statue of what I will eventually look like if I give into every Belle Epicurean craving... I want to go back right NOW...) The almost-hidden-bakery!  On my croissant tour of Seattle, this place gets full marks for A.) being downtown, B.) having a lovely environment, and C.) having delicious pastries.  The almond croissant I had was quite good.  (Made more authentically French by not being oversweet, but I actually like oversweet.)  While not perhaps my favorite croissant in the city, it was WARM which ups the verdict, places that bake croissants on-site in the middle of the day are few and far between.  
I believe this bakery's niche may be in their brioches, which I hear are excellent, and they even had Buche de Noel in the cabinet which made me feel like having a little French christmas right there and then!The cafe is small and elegant (watch out you'll miss it), the next time I go, I fully intend to have a cup of tea and hang out a little.  The proximity to my work makes it the most likely to be hit up on a regular basis. Holy crap YUMMMERS!After seeing the reviews of this place I new I had to stop in on my last trip to Seattle. I wanted to basically eat everything they had in the place it all looked so yummy.My husband and I ordered three things. The classic pecan bun, a potato and rosemary croissant and the ham and gruyere flalalfthaklad thingy!They were all very tasty and our favorite was the ham and cheese thing.My husband had a coffee or a "drip" as they say there and I had a hot chia. My husband wasn't a big fan of the coffee but that didn't stop him from having three cups.We ended up going there two out of the three days that we were there. bri-oche (bre-osh,-osh)n.  A soft, light-textured bread made from eggs, butter, flour, and yeast and formed into a roll or a bun. heav-en-ly (hev-uh n-lee)adj. Sublime; delightful; enchantingHeav-en-ly Bri-oche (hev-uh n-lee; bre-osh,-osh)adj. n.1. Pumpkin and Sweet Potato (Seasonal Bun)2. Pear Almond Bun with Rum Soaked Golden RaisinsBon appetit! "belle epicurean...belle epicurean...belllleeeee...epiiiii..cuuuureaaaaan..­"i found myself singing the name of this lovely bakery/cafe allllll day  (and a few random days) after experiencing the best taste i've had in a long time! (no joke, my bf can vouch for how annoyed he was of my song) smoked salmon baguette ($7ish) + plain crossaint ($3ish)  = heavendon't you love it when you eat a crossaint and the flakes get all over your shirt -- and you don't care! 
*licks lips*"...belle epicurean...belle epicurean!"i think they close @ 6 on weekdays, 4 on saturdays -- and i don't know what time sundays. but that means 7 days to make my heart melt! Phenomenal.  Find any excuse to go here:  shop downtown, pretend you're shopping downtown, whatever...just go.  Now.  The pastries we tried were ham/Gruyere and potato/rosemary croissants.  Quite frankly the best pastries we have had in town yet. This little bakery hits the spot with their savory and sweet pastries and coffee.  We tried the potato rosemary brioche, spicy corn and cheese croissant, and the pear cranberry pastry - all delish!
## dirty
## 741 NA
## 742 NA
## 743 NA
## 744 NA
## 745 NA
## 746 NA
## review
## 741 3.5The pho here is SO cheap!  I ordered a small for $4.95, and couldn't even finish that.  The broth's flavor was good and they didn't skimp on the meat.  Service was quick, and they accept credit cards for orders over $10.The free cream puff is a plus, but next time I'd definitely pass on that.  Mine was stale and not worth the extra calories.Great place for a cheap meal! Went here back during Spring Break so it's been a while but the food was good. I found the prices were also pretty cheap considering the portion sizes!The complimentary cream puffs were a definite treat. I will never go back to this restaurant!! They're more expensive than other Pho restaurants, they charge you if you use your credit/debit cards, and their pastry is always a little dry. Tran Brothers Pho a few doors down from is much better and even a little cheaper. Their pastry is always fresh. I just recently moved to Seattle from Oregon. I've gone to this place twice since living in the Ballard area. The first time I came I order pho ga (chicken) to my surprise it was pretty decent. There was an abundance of chicken and cheap! The second time I came I order the the vegetation pho. I would prefer non-fried tofu and a little more veggie, again I can't complain on the price. Next time I go I'll stick to pho ga.
## 742 I have a bad habit. I often go long periods throughout the day without eating, just so at the climax of hunger I can spend all my calories... in a huge blaze of glory!!!!!!!Belle Epicurean is a great place to do this.When I'm downtown shopping literally until I'm about to drop, there's nothing like getting a revival in the form of flawless pastry and strong coffee. Yesterday I was in desperate need and found solace in a rich cappuccino and, more importantly, brioche with honeyed pears, toasted almond slivers, and rum-soaked raisins. I'm sorry, WHAT?! How dare they. Plus the chef was trained at Le Cordon Bleu and has all sorts of rave reviews on the wall along with her diploma. Impressed? Because I am. I sat there and devoured my 700 calorie snack (served on delicate, beautiful china, obviously they don't know my clumsy ass because they should not be trusting me) in front of a huge mirror. Front-row seats to my gluttony, awesome.Hint: It's only open til 2 on weekends, also it's tiny and easy to miss. So keep your eyes out for the statue of the fat French chef in the entry (or, the statue of what I will eventually look like if I give into every Belle Epicurean craving... I want to go back right NOW...) The almost-hidden-bakery!  On my croissant tour of Seattle, this place gets full marks for A.) being downtown, B.) having a lovely environment, and C.) having delicious pastries.  The almond croissant I had was quite good.  (Made more authentically French by not being oversweet, but I actually like oversweet.)  While not perhaps my favorite croissant in the city, it was WARM which ups the verdict, places that bake croissants on-site in the middle of the day are few and far between.  
I believe this bakery's niche may be in their brioches, which I hear are excellent, and they even had Buche de Noel in the cabinet which made me feel like having a little French christmas right there and then!The cafe is small and elegant (watch out you'll miss it), the next time I go, I fully intend to have a cup of tea and hang out a little.  The proximity to my work makes it the most likely to be hit up on a regular basis. Holy crap YUMMMERS!After seeing the reviews of this place I new I had to stop in on my last trip to Seattle. I wanted to basically eat everything they had in the place it all looked so yummy.My husband and I ordered three things. The classic pecan bun, a potato and rosemary croissant and the ham and gruyere flalalfthaklad thingy!They were all very tasty and our favorite was the ham and cheese thing.My husband had a coffee or a "drip" as they say there and I had a hot chia. My husband wasn't a big fan of the coffee but that didn't stop him from having three cups.We ended up going there two out of the three days that we were there. bri-oche (bre-osh,-osh)n.  A soft, light-textured bread made from eggs, butter, flour, and yeast and formed into a roll or a bun. heav-en-ly (hev-uh n-lee)adj. Sublime; delightful; enchantingHeav-en-ly Bri-oche (hev-uh n-lee; bre-osh,-osh)adj. n.1. Pumpkin and Sweet Potato (Seasonal Bun)2. Pear Almond Bun with Rum Soaked Golden RaisinsBon appetit! "belle epicurean...belle epicurean...belllleeeee...epiiiii..cuuuureaaaaan..­"i found myself singing the name of this lovely bakery/cafe allllll day  (and a few random days) after experiencing the best taste i've had in a long time! (no joke, my bf can vouch for how annoyed he was of my song) smoked salmon baguette ($7ish) + plain crossaint ($3ish)  = heavendon't you love it when you eat a crossaint and the flakes get all over your shirt -- and you don't care! 
*licks lips*"...belle epicurean...belle epicurean!"i think they close @ 6 on weekdays, 4 on saturdays -- and i don't know what time sundays. but that means 7 days to make my heart melt! Phenomenal.  Find any excuse to go here:  shop downtown, pretend you're shopping downtown, whatever...just go.  Now.  The pastries we tried were ham/Gruyere and potato/rosemary croissants.  Quite frankly the best pastries we have had in town yet. This little bakery hits the spot with their savory and sweet pastries and coffee.  We tried the potato rosemary brioche, spicy corn and cheese croissant, and the pear cranberry pastry - all delish!
## 743 Chicken- moist (I am using my least favorite word to describe it, that is how good it is).Rice- PerfectTeriyaki Sauce- DivinePot stickers- MMmmmmThis is the ONLY teriyaki joint I will go to anymore. I have searched high and low in Seattle for the perfect chicken with the perfect sauce and this is it. They were kind of new when I first went in (what appeared to be the opening week)...but they worked the kinks out (staffing, not food) and it is a pleasure. The portions are huge, it always takes me two sittings to finish it all off (plus I ask for the cucumbers on the side which makes room for more on my plate :)I actually just finished off my takeout, and had to write a review...... Only have had the spicy teriyaki and its been good both times. I like the sliced cucumber salad they serve with it, it balances out the heat.a bit expensive for some yaki if you ask me, but not by much.. Good chicken, just beware of the Daily Special/Lunch Special mix-ups.Always ask if your Teriyaki comes with Gyoza. Don't assume it does.
## 744 Somtam Thai is a newcomer on the Ave. and the food is great! The owners have a lunch place downtown and have expanded to this location in order to spread the love. The prices are totally reasonable and the hospitality and generosity of the owners is tough to beat. The usual staples are all great but what makes their menu unique is that they offer four kinds of Som Tam (shaved Papaya salad) as well as a bunch of other dishes that aren't as common on American Thai menus like three variations on Laarb; one a warm chicken salad mixed with shallots, mint, scallions, lime juice and roasted rice powder and the others with pork or sliced beef. They also serve Gai Yang, a northern-style grilled chicken that has become ubiquitous throughout Thailand but isn't commonly available at most restaurants here in the states. Make sure you get some sticky rice to go with any of these dishes as it is the traditional accompaniment. I also love any of their soups and the Pad See Yew and Garlic Beef/Chicken. One thing to note: the owners are used to accommodating American tolerances for spicy food so if you aren't a complete wuss I would definitely recommend going higher on the spice scale if you like a little kick.
## 745 The BEST prices on Japanese food stuffs around town, beats Uwajimiya easily.  Staff actually speaks Japanese, rare these days in the Seattle area.  Small on the inside but big on heart and service.  Plenty of fresh choices at the deli the bento boxes are #1 in my opinion in flavor and selection.  Flavors remind me of how my Aunt used to cook for us, old skool baby, yeah!  Their lunch counter is served up hot and fresh, no leftovers from the day before but get there early for the best choices.  Portions are good and the choices suit almost everyone and to top it off, the food tastes really GOOD!My kids luv to make the trip down to this store as the selection of Japanese cookies, cakes, candies, beverages, and snack items are varied and priced just right.  Personally in my opinion the isle of pickled veggies is amazing!Worth the trip to Georgetown to try it out, even if we live in Shoreline! Uwaji is rip off, but Maruta is friendly price! I love their osouzai too ...!
## 746 Decent pho & spring rolls. The large bowl is way too big for one. Good lunch spot. My girlfriend and I recently discovered Pho Viet Anh.Since discovering this jewel, we have dined there over a dozen times.  This is one of our favorite restaurants.  It is affordable, fast, friendly, and an all around great dining experience.Henry and Carol own the restaurant and obviously take great pride in their establishment. The restaurant is cozy, in the evening the lighting and tapestries create a romantic environment.When we visit, Henry greets us by name and makes us feel at home.  We like to start with a pot of hot tea while trying to decide what to order.  In our many visits, we have ordered most of the vegetarian and chicken dishes. The chicken and vegetarian Pho are delicious.  The Vegetarian Fresh Spring Rolls are amazing.  The chicken and vegetarian curry dishes are to die for.  Yesterday I ordered the Lemongrass Chicken, which I loved.If you are looking for a great place for Vietnamese food, this is the place. The Pho Viet Anh is a place I sometimes go to have Pho Ga, drink some tea, and read a little. I do this because the staff is very friendly and accomodating, and always have the courtesy of making certain you have fresh tea. The yellowish decor is nothing much to speak of, but it's quaint and suffices. The exterior of the building on Roy is a bit confusing-- it's a bit of a hole in the wall, but it is right next to Thai Heaven. The key difference with Pho Viet Anh's soup broth is that it is not overly salty, it seems to be richer, tastes just like mom may have made it, and I have verified that it does not contain MSG. While I did not take it to a lab, I do have a friend who went there with me and she is very allergic to MSG, and she had no problems whatsoever. I do like the spring rolls-- they are fresh. 
There's the typical Sriracha sauce available (also refered to as "rooster sauce" or "hot cock sauce") for those who like to burn their mouths and intestines like I do. Of the 3 Pho places I usual eat at, this one is the least expensive and is by far the friendliest.I like good Pho, and good hospitality, and don't like to feel rushed. If this is the kind of place you seek, then I encourage you to try it out.
## 'data.frame': 200 obs. of 2 variables:
## $ dirty : int NA NA NA NA NA NA NA NA NA NA ...
## $ review: chr "Let me start off by saying that this place is located on the corner of Dexter and Hayes.  This little fact has been a sour"| __truncated__ "Nice place.  Smallish, low key, comfortable decor, but also nice enough to dress up for if you like.  Not that we did"| __truncated__ "Everything I could want in a neighborhood pizza place. Solid east-coast-style pizzas, cold beer, and full tilt ice cream! Plus,"| __truncated__ "Mediocre food and decent service. Why don't they get a better restaurant in place of this place? I title this \"Holy s&*, s"| __truncated__ ...
## - attr(*, "comment")= chr "glb_newobs_df"
## dirty
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## review
## 1 I'm only giving this place 5 stars for the severe nostalgia it invoked when I walked in the door.  Growing up as a vegetarian in San Jose, CA, the epicenter of the Vietnamese community in Northern California, my family and I quickly leaned on the support of the local tofu deli and fell in love with the variety of foods they offered.This place stands up to the competition down South, I believe.  The offer up fresh foods, like red curry, stuffed bitter melon, stir fried mock fish (mm...), bean curd egg rolls, regular egg rolls, a variety of bean/legume based desserts, and fresh tofu and soy milk!   ahh!  out of control.  A vegan's paradise!  The Banh mi is also pretty delicious and I think 2.75 a pop, however prices are not listed and that's my estimate from a breakdown of a bill of $19 for a ton of food.Caution for Vegans though - Banh Mi traditionally has mayonnaise, so just ask for a sandwich without any and you're good to go. I LOVE THIS PLACE! 5 STARS!!Let me start by saying this deli is ALL VEGAN!  Being a vegetarian white person who doesn't know any Vietnamese, it's awesome and reassuring to go into a Vietnamese deli and be able to order anything and know it doesn't have hidden animal products lurking within!I love ChuMinh's tofu and have been eating it for years, so I'm very excited that they finally opened a deli serving tofu-laden vegan dishes!I've even gotten my meat-eating boyfriend addicted to this place.  We usually order the 2 entree dinner combo ($6) with red rice (super delicious!!) and we pick 2 entrees that look good that day.  So far I've had the mixed vegetables, tofu curry, tofu and greens, and a special tofu thing with pineapple and mushrooms and vegetables that they had made as a one-day experiment.  We also order banh mi ($2.75) to save for lunch the next day.  My favorite is the spicy lemongrass tofu.  
Last time we went on a weekday at about 7:30pm (they close at 8) and they were out of lemongrass tofu so we got onion instead which I was happy with.  I've also gotten the tofu spring rolls ($2) which were very delicious!  The $6 dinner combo is easily enough food for 2-3 people as it comes in a full-size compostable take-out container with generous servings.  This place is pretty tiny and they have one dining table in the middle.  It's best to get your food to-go and pay with cash.  They take credit cards (with a $0.50 charge for transactions under $5) but I think their machine is very slow and/or doesn't work sometimes.  I tried to pay with a card once and she said the machine wasn't working properly so I ended up paying with cash instead.  Taxes are built into the prices, and be sure to throw some cash into the tip jar. Good food! Tasty! Good value. Good for you. Small hole in the wall tofu cafe I was surprised how good it was. Different and interesting dishes.Been there twice now!! Best vegan bánh mì I've ever had in a deli/restaurant.  $2.75 with hearty serving of tofu.  Great! I recently popped in to this place and now I'm obsessed with their red curry.  One lady that helped me last time was really nice, and everything came out SO CHEAP.  I never had to pay anymore than 10$ and they give you a ton, so i've made several meals out of one take out.  good deal! All the dishes were delicious and the people working there were delightful!  Can't wait to tell all my friends about one of my new favorite vegetarian spots. Super awesome tofu banhmi; they let me try it with the chili, lemongrass, and green onion all in one and it was great. Someone else described the ladies working there as your "favorite Vietnamese aunt" - never had one, but that's how I'd imagine them to be. Seems you can't go wrong with ordering the stuff there, it all looks really good. The lemongrass tofu's so good I like eating it straight, and the curry tofu dish I had was pretty tasty. 
Want to go back and try their veggie pho. I love this place.  My wife and I eat here all the time
## 2 "ewww.....bleh..."  sums up my bao experience at Yak's.  I made a detour over to Fremont after Yelping "Bao"   I bought the last two BBQ Pork Baos.  It held promise.  The steam wafting out the sweatin' Bao Cabinet and the site of the last two seemingly soft  mounds of the baos made me salivate.  I take my baos home, opened my container, sigh and sunk my teeth into the bao.   Then...a crashing disappointment.  This is the most DIGUSTING bao I've ever had.  What is this ground pork chinese sloppy joe $hit?!  It is so bad, I immediately closed the container and off it went into the trash shoot. :-( Now I'm off to forage for more food.  I wish I knew how to cook.... This is not the Yak's I used to frequent in the 90's, but then again the Still Life cafe up the street is replaced by some upscale nonsense. Not bad teriyaki I guess, seen better days though in my opinion. Teriyaki was  pretty tastey, the rice Was just ok. They have wifi if you ask for the password
## 3 This is my fallback dim sum place when the wait at Jade is too long.  Sun Ya has much better congee than Jade. it's rare when the service makes me pipin' mad.  hey, i think i'm a conscientious guy.  i pick up litter at parks, i pet stray cats, and i put  many a stripper through college.  at first, this place was cool.  parking is free, the dim sum oilier than a teenager's face, all good qualities.  the ladies zooming their carts along the narrow spaces between tables, more entertaining than teatro zinzanni.  i'd like to see one of those acrobats try what these dim sum cart ladies master the craft of pushing steaming piles of dumplings and zoom away.  it's like selling cigarettes to kids, they make it look so easy!  my gripe is the final bit of service.   i'm covered in ink as in tattooed.  my friend has them too all over his bald head.  now, in some Asian circles, particularly the old skool way of looking at things, tattoos are frowned upon.  so is having your wife's feet unbound and not looking like little moon crescents(see foot binding).  yup.  walking wives are a shocker.  in any case, the old fart puts the check on our table while we're still EATING.  and stands there.  and stands. and waits.  stands. waits.  what is he doing?  i don't know.  you tell me, dear reader.  looks like he wants my friend and i to pay the bill and leave!  for what?  having tattoos?it turns out my friend is an IT manager and i'm in the similar field, cuz that's the only way we could afford tattoos.  the shit is fucking expensive.  it's too bad, cuz i was digging the place up to that point.  we were even behaving properly, we were talking about work, for chrissakes.  if you're going to judge me for my looks.  look at my ass.  it's like prince's and his  is iconic.  do you remember his pants at the MTV music awards?  when i first saw it, i'm surprised i didn't turn into a pillar of salt!my friend wasn't mad.  
he was like, 'i've accepted that having these kinds of tattoos will mark you'.  maybe i took it a little too personally, i was raised in a old skool asian household.  and i fucking hated it.  i believe wives deserve the right to walk.  i wanted to tell ol' maggot dick, 'hey, pal, welcome to america, people with tats are human too!  i'm not an animal!  (weep) i'm not an animal..." I think this place deserves a higher rating!  Have not been here in forever but chose this in Chinatown because it's close to our grandma's nursing home. The service is excellent, attentive and staff with the dim sum cart was friendly enough.  The food was good - some items we like are not offered here but it was still a good meal.   Parking can be challenging but that goes for everywhere in Chinatown - they have a small lot.  We found street parking ok though around 11:30m on a Saturday morning.Only negative is that they don't have those lazy susans for the large round tables.  Getting to all the dimsum is challenging but luckily we get two of almost every item!  :-) In terms of dim sum I was a bit disappointed. But due to the rapid turn around the stuff was fresh.  There are only 4 types of carts: Steam, Dessert, Fried, and Congee.  The selection was really dismal.  They had the usual har gao, sui mai, chicken feet (white), bean curd, noodle wrapped shrimp, pai gu, eggplant stuffed with meat, kai lan, beans, fried rice, lotus wrapped rice, etc.Still not great if you're looking for dim sum...I would go for some where else better recommended.  Also the parking lot's tiny, but the parking's free. Came with here with a group of friends and we sat at the bar so we can watch the football games.  There's no carts rolling by at the bar but you can order dim sum from the menu.  My friend is a regular here so service was good and the lady came back with all fresh dim sum, can't beat that.  Prices seem lower compared to other places in Chinatown. 
Haven't been to Sun Ya in forever but I was craving dim sum and didn't want to wait an hour for Jade Garden or Harbor City. The food is pretty good and CHEAP!!! Dim sum for 4 was $17.50 and we were stuffed. Service was ehh.... Check this place out if you want cheap decent dim sum. But if you want good dim sum, just walk on up the hill and look for the crowd of people standing outside of restaurants. I have been here a few times and the one reason I think I keep going back is the pea vine greens in garlic sauce. Absolutely my favorite dish and not all that common.  I have had the chicken corn soup and enjoyed that as well. Tonight we had the pea vines, tomato beef noodle soup and chicken egg foo young.  Well the pea vines were excellent as usual.  My boyfriend really enjoyed the soup but then he loves anything with tomato and beef in the name. I have to admit though, it looked good and the one taste I had seemed pretty tasty. Now the egg foo young.  I don't know how many of you have ever tried this. It is not that popular a dish and is not even on the menu in many Chinese restaurants. I believe it is something traditionally made with leftovers and not considered something you would serve to guests or customers.  Nevertheless it is found here and there and has always been one of my favorite splurges.  It is typically a greasy omelet with beans sprouts, meat or fish and other vegetables (onions, peas, water chestnuts, etc).  Served with gravy it is not recommended for those with cardiac disease.  Well, that is what I had in mind when I ordered.  What I was served was something different. It seemed to me that they added some sort of starch to the omelet part, it seemed more doughy or perhaps started out like some sort of batter.  The egg was not really detectable except for a yellowish color. It had no taste until I put a lot of soy sauce and white pepper on it after which it was edible.  Huge bummer.  
I regret not sending it back or at least asking "what is this?"Yes the windows are filthy and there is nothing fancy about the bathroom but let me know when you find a Chinese restaurant in the international district that is different.  The service is so-so.  Parking almost always available and is a huge plus. In fact I think were were headed somewhere else tonight and couldn't find a parking place so ended up here. I agree with Lady W that this place deserves a higher rating - the food is good! It may be a bit crowded, not to clean (especially the bathroom), and the parking is hard to find on Sundays, but the service is curteous, fast, and, did I mention?, the food is good. This place is awesome. Clean, fast service and best of all fresh and yummy good. I only go for dim sum and I have to say it's my tops in Seattle !
## 4 Sun Ya = zzzzzzzzzzzThere is absolutely no variety here. I think we got some hum bao, shrimp dumplings, peking duck and mango pudding, but that's about it. The food itself was decent quality despite being cold, but I was really bored and hungry waiting around for a cart or some service. The service was also pretty rude here and I felt very ignored.I guess if you like peking duck, this is the place to go, but don't expect much else here. I used eat here with my family for dim sum all the time when I was young, but I recently tried it out again a few times recently and found that the food hasn't been very good. It's just not as good as it used to be. But that doesn't mean it's bad. Actually, I still recommend it if you're in Chinatown and looking for some pretty decent dim sum. Although the quality of the food has declined in the years I still enjoy it! It's great to go with family too. So you should try it out whoever is reading this! The end. I have to admit that i have been spoiled with great dim sum. Trips to hong kong, Vancouver, Monterey park and living in SF. This place has A minimal dim sum selection. They didn't have woo gok! (fried taro puff) or xiao lung bao. hello! Wth?? Cart ladies were friendly enough. They spoke english and one was even young and fluent. Service was fine. We weren't really motivated to eat much at all. Had the har gow (shrimp dumpling) which was decent, shrimp in rice roll (not quite up to par but edible, wrap was not soft) and chicken feet (nothing to write home or yelp about).  the diet coke was so watered down, there was no flavor or caramel color. What a joke. How do you screw up a diet coke? 1/2 star penalty for the soda brings us to 1.5 starsWe really really tried to eat more but even being almost 5 months pregnant, not much looked appealing ( and i can ALWAYS EAT). Total bill $11.40. Wow, that's a an all-time lowI shoulda listened to my fellow yelpers than take the advice of our Asian concierge dude at the westin. 
Lesson learned. Seated quickly for a large group.  The carts came around quickly.  Here is where I will probably get my chinese card revoked though.  It didn't look that good. It also didn't taste that good.  =( And I agree with other reviewers, the cart ladies didn't know enough english to communicate to their patrons, so they came off as abrupt, even rude.  This became an issue when one of our party wanted a diet coke.  Finally management brought a coke, and it was so watered down it was more "diet" than expected. Had a hankering for almond boneless chicken with gravy but was not willing to drive over I-90 to South China, which has the best ABC. A friend had suggested Sun Ya, which we have visited several times for dim sum and big Chinese banquets.  Sat in the bar and ordered the ABC, soy sauce chow mein w BBQ pork, and asked what kind of vegetables they had.  Ordered the pea vines with some beef, figuring that you were supposed to order a meat with it.  The bill seemed a little high and we then noticed that the pea vines were $14.95!!! Asked if that was right, and she said it was a dollar more because of the beef.  My god - who would pay $13.95 for pea vines without meat??  Got all the stuff home, after going back in for the forgotten rice, and unpacked it all.  The chicken, was like a beige color and not at all golden brown. The saving grace was the breast meat.  Almonds were sitting atop the gravy, which was only a slightly darker shade of beige.  It was good, not great, and definitely did not measure up to South China's. The chow mein was good, but a little skimpy on the meat. Those expensive pea vines? Good, but I'm not sure why Chinese restaurants do that weird thing with baking soda to their meat.  It makes it soft and squishy. I'd rather eat a tougher piece of beef than a spongy chunk.  Next time I'll have to ask my favorite waiter Philip what/how to order...
## 5 Dim Sum for lunch on a drizzly day in Seattle.  'Guarded parking lot' mostly to keep out non patrons from parking.Seated in a booth right away and dim sum carts came by quickly.Noted that the servers were much more friendly by saying 'enjoy' after serving and making recommendations, so good stars for service.All in all, hot dim sum, good service and tasty, no complaints.Priced reasonably too. Okay look....I'm not a super big Dim Sum expert, but I know when its good, so I'll keep this short.Sun-Ya sucks. My lady and I popped in for lunch when we were running errands in the ID. The place is huge, and has a lot of good seating, but then you have to deal with the wait-staff.The gals pushing around the dim sum carts were pretty much terrifying. If you didn't know EXACTLY what you wanted when they arrived, you got scowled at, cursed to in Chinese, and then they simply bailed out on you.The dim sum was less than pleasing too. Everything was fairly rubbery and old tasting.  Seattle has a BOUNTY of awesome Asian cuisine. Save yourself from a bad experience and try somewhere else in the ID for dim sum before you go here. This was the worst dim sum experience I have ever had!We went there by recommendation from Nancy Leson, the Seattle Times food writer and found ourselves very disappointed. The food was only so-so as far as dim sum in Seattle goes. Some items were supposed to be served warm but were served cold. The waitresses were shouting at each other over the customers' heads. We were short 1 tea cup and it took asking 4 times (including the restaurant owner) to get it to us. When the cup arrived, it was dirty! I would like to say something nice but I can't think of anything. Yes, there is parking but good luck trying to find a spot. There are many choices for dim sum in the international district in Seattle, skip this one. I wouldn't go there if it were free! What the Hell, people. 
This place has perfectly acceptable dim sum, and it's cleaner than most places I've been in Vancouver, New York, or Hong Kong. Double happiness. Four stars. Seventeen black holes. Avogadro's number of carbon molecules. I hereby declare this place to be tripleplus good, and detractors can bite my shiny metal ass. Oh no! There are a million people waiting outside Jade Garden! I didn't know so many people had Veteran's Day off. What now??!?!?! I don't know about you, but once I've got dim sum on my mind, there's no stopping me! The BF and I were wandering the streets of the ID for a restaurant that A) wasn't crazy busy like Jade Garden B) not totally empty, eek! C) serving dim sum Luckily, we came across Sun Ya and only waited about 5 minutes for a table. The service was friendly and the food was good. It wasn't my favorite, but I left satisfied and full! The shu mai was especially delicious! I'd come back again. Sun Ya is not the best restaurant in the I.D., nor is it the cheapest. In fact, the best way to describe the place is, "pretty average".What distingushes Sun Ya, and earns it a fourth star, is the consistent (if unremarkable) food quality, the better than average service, and free parking.The free parking, in particular, is a joy... if only because of the crabby old parking attendent. While he may be grouchy, and his English is imperfect, he can be an absolute charmer with old ladies and young children. I don't know why there are so many bad reviews, this place is definitely a 4-star dim sum joint.  Good service.  Lots of tables.  Many dim sum selections to choose from.  And of course, inexpensive. Friendly service and well executed and fresh dim sum makes Sun Ya a first choice for dim sum.  Slightly broader choices are also a welcome change from the standards.  Best dim sum outside of Vancouver. Sun Ya is a very poor representation of how good Chinese food can taste. 
In fact, I don't think I've ever felt so disappointed with Chinese food in my life.My family and I came here last week for a small, private dinner banquet to celebrate the birth of a cousin's cousin. I'd been to Sun Ya a couple of times when I was a kid, and I had nothing against the food then, but oh how ignorant kids can be...*shakes head*The restaurant itself is spacious, perfect for larger gatherings and easy for dim sum carts to navigate. There's even a parking lot outside, which makes this restaurant very attractive for those who are too lazy to find street parking. The service is friendly, too. However, that's about all I can remember of the good things.First off, why were the walls dirty? I'm sure that decades ago the walls must have been white, but they sure haven't been cleaned in a while. Second, why were there no turntables on any of the tables? When the tradition is family-style meals and there's usually eight to ten courses on the table at a time, it is ESSENTIAL to have the turntable. No one wants to lean over the table to pick up food or have to keep shuffling dishes around. Third, there is something very wrong when you see lip smudges on the glasses. Isn't it a requirement for everything to be thoroughly washed before placing it back on the table?As for the food...boy, don't even get me started. The appetizer plate (jellyfish with assorted meats) was not cold, but lukewarm. Jellyfish is best served chilled, but it felt like the dish had been sitting on the counter for a while. Next came the standard shark fin soup, which was lukewarm and seriously lacked in flavor. I dumped plenty of pepper in to try and liven things up, but even that couldn't save it. Then came the duck in small buns, which was actually alright, but the buns looked oddly deformed, lumpy instead of smooth. The walnut prawns weren't bad, either. Our steamed fish was huge, but because of its size (or maybe it was overcooked), the meat was tough and dry. 
The beef and broccoli dish reminded me of a chewier McDonald's burger, while the fried rice, crab, and lobster were only average. There was another vegetable dish - spinach, I believe - but I didn't try that because of my vendetta against the leafy greens. Also surprising was that no rice was served. I know some restaurants ask that you order it separately, but they should really start including it with the banquets. It simply feels wrong to be eating Chinese food without any rice.After we finished our meal, I noticed a very bare fish tank in the corner of the room. When I stepped closer to examine it, I saw what looked to be a few goldfish and some small koi. Nothing else was in the tank except for the fish: no gravel, no fake coral, no bright tank decorations. These poor fish looked so sad and neglected, it was depressing. If you're looking for dim sum quickly in Chinatown without all the hassle of waiting in line to be seated, this is your place. But be sure to know that the reason why people wait at the other places is because the food is better there. After getting that out of the way, Sun Ya is great for what it is.They have super nice folks who make the experience really welcoming. But when it came to the food -- it was just meh. We got a real good assortment of humbao, shumai, shrimp balls, rice noodles, bean curds, sticky rice, etc. They were all pretty ok. Not great like the food at Jade Garden or Top Gun in Factoria.Would I go here again? Probably. Only if there 1 hour before a Seahawks game and I'm hungry and wanted to go to dim sum. This place is perfect for that. Not the best food, but not the worst either. I love the variety of dim sum they have and I always go here for that REALLY yummy calamari. The steamed dumplings are always in abundance and I really love the fried pork belly. However, I recently went there and I feel like the dishes were a little cold and weren't the tastiest. Maybe it was just that day. 
But really when it comes down to it (this will sound bad- don't judge me!) but I really like coming here because they always have parking, seating, and a lot of food to choose from. Convenience is key when it comes to my hunger and for this reason, I go to Sun Ya.
## 6 I'm just your unsophisticated, limited-travel white-guy--but I like dim sum and I like Sun Ya's dim sum. There's a bigger place with more parking down the street, but? that place is a leeetle dingy, like with old food on the floor, so then i worry what the kitchen looks like. Sun Ya is clean everywhere, including the bathroom--which is my criteria for Chinese restaurants. And, Sun Ya has lots of carts with lots of choices. Now that I think of it--the other place had deep-fried chicken feet but I don't think Sun Ya did. So I do visit the other place maybe 1 out of 5 times. The rest is Sun Ya.And am I surrounded by other round-eyes when I go to Sun Ya? Naw, I'd say the customers are about 80%+ Asian. Very Asian. Now maybe they are 5th generation Koreans from Bellevue--how would I know? Surprised by the crummy reviews - the only thing I can think of is that things have changed (there are some dates ~3 yrs old on some of the negative reviews).I've been coming here on and off for over a year now and would recommend it to any friends looking for dim sum in the ID.  A Chinese friend recommended it and now a group of us make it an ongoing occasion - I usually make it down for dim sum every other month or so (I try to avoid busy weekends during football or baseball games).  I have always had great service (like the server who notices I empty my water glass a lot-- brings me a pitcher!), a friendly waitstaff who help with special requests (like when they don't have the kongee I love), and very reasonable prices.  The steamed hum bao has always been fluffy and flavorful and the ha gau has been tasty & fresh.  Our large groups usually come out of there stuffed - for about $10-12pp!I've never noticed any insects or other dirty areas; the place is a little shabby on the inside, but I haven't ever seen it untidy or dirty. brought mom here for dinner and we had the singapore chow mein and honey walnut prawns.  
food was very bland :(  i used to go here 9 years ago and i remember it being THE place to get your Chinese food on but not so much anymore.  not sure about their dim sum...surely it has to be better than dinner.  staff were nice, place was clean, although bathroom could've been scrubbed a little more. sun ya is probably one of the oldest restaurants in seattle's chinatown.  i remember coming here when i was a wee tike with my parents for dinners, parties and dim sum.  although they did do a remodel a couple of years ago, stepping into this place, you feel like you went back in time almost 40 years ago.  this place reminds me of going back in time to the 70's.  the restrooms are kind of creepy, maybe because it's a little dim in there, it seems like someone's gonna jump out of the stall and jump you, hah hah.  ok, so enough bashing of the ambiance and atmosphere, let's get to the food.  dim sum here really isn't bad, it's always piping hot and the prices are very favorable.  however, the variety is slim, pretty much most of the dim sum items are dim sum staples you can find at any dim sum restaurant; that's a shame because the food they do have is good.  the service is pretty good for a chinese place, your tea and water are almost always topped off, a big plus.  they do have a parking lot, but it is almost always full, you are better off finding street parking.  so, to cap it all off, the food is good, variety is lacking, service is good, parking can be difficult, and the ambiance?  let's just say they need to do a major overhaul of the whole place. this place used to rock my world with their ginger and green onion dungeness crab, now the foods, including the crab are not as good as it use to be. The only time I dine here now is only for dim sum, not that it's the best in the city, but Sun ya Dim sum is not bad, sun ya had good parking, and Sun ya can accommodate quite a large group of people. 
and the chicken feet is tasty as well I am among the reviewers here who are surprised about the bad press on Yelp about this spot. This place is really not terrible, I mean it! The servers were attentive, informative and very friendly. The food was hot and even more importantly, fresh. We ordered shrimp ball wraps, shrimp cakes, eggplant, calamari and coconut filled pastries for dessert. The cashier/host was very nice and all in all I thought it was not bad being around 20 bucks for my friend and I. They have a bar in the back area that we didn't venture into and they also feature a private parking lot- which we didn't use as we were unaware of it. Maybe they changed owners or put us in the good seating or something, because I didn't notice it being gross or unclean in there whatsoever. The silverware, glasses, and table were decent and clean. Maybe they got a new manager. this place has two identities: you got the huge chinese banquet style with the huge round table and booths. then you got the other room: the bar lounge. i like em both but i guess i was a little more in love with the bar lounge. it felt so retro and has that dive feel where u can eat and not be bothered. i can get a drink, eat some good chinese food late at night, and watch tv? i'm a happy person.they've got some huge crabs there. i haven't tried them though. it's good chinese food, not amazing but your standard good. walnut shrimp, pork chops, egg dishes, noodles - yummy.and they have a parking lot! in chinatown with lack of parking this makes it so much more appealing. We went to Sun Ya's as a group of 12 during lunch, we ate dim sum off from the ample selection off of the carts.  The service was very good, friendly and accommodating. All of the dishes (chow mien, fried rice, mongolian beef, prawns, wanton soup) were very good. I'm a fan of their baked hum bow.  I bought a dozen there one morning while I was out on an errand it was cheap!  $9.50 for a dozen and they were still warm.  
I like me a dozen hum bow.  I also like me some cheap, delicious hum bow especially!  Get some! Food is good and place is ratty.  It's in Chinatown so what more can you expect.  I was here for a special banquet and the service was nice, food was just okay and the place, well, needs a remodel.  A funny thing that happens is the parking lot guy comes in and hollers out a license plate name like you are playing bingo.  If your plate gets called, you get to move your car.  That how they fit as many cars as they do in their tiny lot.  At least they provide free parking.  Not sure if I will go back but since we know the owner, it may happen. AWESOME!  We've now eaten here 2x.  Everyone loves it . . . i guess that's the highest praise.  That includes my kids, my brother and his kids, and my parents.  The nor mai gai are super duper awesome, the siu mai wonderful, my youngest son ate about 100 hargow.  awesome.  everything is hot, fresh and delicious.  they brought me my oolong tea, and everything else we wanted.  2 types of hot sauce.  what more can i say?  also, the super old maitre d' is a good host as well. I used to want to give this place 1-2 stars... because their Chinese food is overly americanized for my taste.  However, after trying it again this weekend I'm giving this place a little more credit, 'cause the King Crabs were really good and affordable! At less than $10 a pound it's probably the cheapest in Chinatown area, and they cook it in several ways, steamed, fried, garlic... Oh, and you gotta try the alligator soup, taste like chicken :) 3 stars for free parking and the big crabs. This is the best dim sum in the International District, hands down. My husband, mother-in-law and I have gone to every other restaurant down there over the years, and Sun Ya is consistently the best. We just had it this morning, in fact.One thing I do recommend; get there between 10:30 and 11, otherwise there's no parking. Wow - this place sure has acquired some bad reviews here. 
I can't really fathom why. I have eaten here many times, and have never had a bad experience. The carts are always numerous, it is always busy and the food is always decent. Some of my families faves are the congee, the fried calamari with diced jalapeno, the chinese broccoli with oyster sauce and the steamed shrimp rolls.  Is it the best? No, but it is certainly worth a try. The fact that it has parking in Chinatown is an added bonus.I must reply to some of the specific "complaints" form those who have given this place one star reviews:"There are too many white people": Really? Ignoring the fact that that borders on racist, it simply isn't true. In my experience, the vast majority of patrons are asian families, many conversing in various Chinese dialects."Sun Ya is infested with roaches": While I have never seen one there, I don't doubt that other people have. IT'S IN CHINATOWN. Every restaurant in Chinatown has roaches. And rats. Trust me.
## dirty
## 1 1
## 97 1
## 211 1
## 214 1
## 219 1
## 252 1
## review
## 1 I'm only giving this place 5 stars for the severe nostalgia it invoked when I walked in the door.  Growing up as a vegetarian in San Jose, CA, the epicenter of the Vietnamese community in Northern California, my family and I quickly leaned on the support of the local tofu deli and fell in love with the variety of foods they offered.This place stands up to the competition down South, I believe.  The offer up fresh foods, like red curry, stuffed bitter melon, stir fried mock fish (mm...), bean curd egg rolls, regular egg rolls, a variety of bean/legume based desserts, and fresh tofu and soy milk!   ahh!  out of control.  A vegan's paradise!  The Banh mi is also pretty delicious and I think 2.75 a pop, however prices are not listed and that's my estimate from a breakdown of a bill of $19 for a ton of food.Caution for Vegans though - Banh Mi traditionally has mayonnaise, so just ask for a sandwich without any and you're good to go. I LOVE THIS PLACE! 5 STARS!!Let me start by saying this deli is ALL VEGAN!  Being a vegetarian white person who doesn't know any Vietnamese, it's awesome and reassuring to go into a Vietnamese deli and be able to order anything and know it doesn't have hidden animal products lurking within!I love ChuMinh's tofu and have been eating it for years, so I'm very excited that they finally opened a deli serving tofu-laden vegan dishes!I've even gotten my meat-eating boyfriend addicted to this place.  We usually order the 2 entree dinner combo ($6) with red rice (super delicious!!) and we pick 2 entrees that look good that day.  So far I've had the mixed vegetables, tofu curry, tofu and greens, and a special tofu thing with pineapple and mushrooms and vegetables that they had made as a one-day experiment.  We also order banh mi ($2.75) to save for lunch the next day.  My favorite is the spicy lemongrass tofu.  
Last time we went on a weekday at about 7:30pm (they close at 8) and they were out of lemongrass tofu so we got onion instead which I was happy with.  I've also gotten the tofu spring rolls ($2) which were very delicious!  The $6 dinner combo is easily enough food for 2-3 people as it comes in a full-size compostable take-out container with generous servings.  This place is pretty tiny and they have one dining table in the middle.  It's best to get your food to-go and pay with cash.  They take credit cards (with a $0.50 charge for transactions under $5) but I think their machine is very slow and/or doesn't work sometimes.  I tried to pay with a card once and she said the machine wasn't working properly so I ended up paying with cash instead.  Taxes are built into the prices, and be sure to throw some cash into the tip jar. Good food! Tasty! Good value. Good for you. Small hole in the wall tofu cafe I was surprised how good it was. Different and interesting dishes.Been there twice now!! Best vegan bánh mì I've ever had in a deli/restaurant.  $2.75 with hearty serving of tofu.  Great! I recently popped in to this place and now I'm obsessed with their red curry.  One lady that helped me last time was really nice, and everything came out SO CHEAP.  I never had to pay anymore than 10$ and they give you a ton, so i've made several meals out of one take out.  good deal! All the dishes were delicious and the people working there were delightful!  Can't wait to tell all my friends about one of my new favorite vegetarian spots. Super awesome tofu banhmi; they let me try it with the chili, lemongrass, and green onion all in one and it was great. Someone else described the ladies working there as your "favorite Vietnamese aunt" - never had one, but that's how I'd imagine them to be. Seems you can't go wrong with ordering the stuff there, it all looks really good. The lemongrass tofu's so good I like eating it straight, and the curry tofu dish I had was pretty tasty. 
Want to go back and try their veggie pho. I love this place.  My wife and I eat here all the time
## 97 Great food, great atmosphere, great view of the water. Highly advise on getting the meatloaf, fish n chips is good. Breakfast is yummy. Good service, Family Friendly, and cool location. Love Paddys! This place was barely A-OK.  It's a big tourist trap, situated on the water and next door the Victoria Clipper terminal.  We had lunch, which was average at best:  an over-priced steak sandwich and an over-priced reuben sandwich.  Nothing special, nothing horrible either.  In other words, pretty blah. My husband and I walked down to Paddy Coyne's for dinner recently. We got seated and there were two other tables of diners in the place that we could see - one table inside and one outside - yet no one spoke to us or brought us water for at least 10 minutes.  We literally waved a waitress down and she acted surprised about it, which was confusing because one of the other sets of diners was sitting RIGHT next to us and she helped them, so there's no way she didn't see us. There was a waiter standing next to the bar the whole time and he wasn't doing anything (besides watching TV and talking to the bartender), so it's unclear why he ignored us also. Once the waitress came over, the service was ok from her, but bad service is SO frustrating, especially when HARDLY ANYONE ELSE IS THERE.  Clearly they saw us.  And chose to ignore us.  Why?!  The food wasn't anything special.  I had a salad with chicken and my husband had the colcannon, which is a bunch of stuff mixed together with mashed potatoes.  It looked unfortunate but tasted ok.  Turns out something must have been wrong with my salad, because I spent the night barfing it up.  Gross, I know...trust me.
## 211 Drinks:  Very, very good.  Food:  Good, not greatAtmosphere:  Pretty niceVibe:  Pretentious; but I'm an old Capitol Hill fart, not an scenester so maybe it's me.Service:  Adequate, not particularly good or friendlyThe whole speakeasy thing is just stupid - I hope this trend ends soon. I had to redo this since they got written up a one of the best bars in America.  The place is always packed with Yuppies now.  I know it's not the bar's fault for being awesome, but it was a stripey button-down shirts and spiked heels meat market on a Friday night.  Hey foodies: don't go on the weekends. One of my favorite Manhattans in town (tied with Still Liquor). All the bartenders know what they're doing - tell them what you like, and they can improvise from there. Or go with your standards and they'll do you right. The food is generally excellent. The fried chicken is not for the squeamish - it looks like they undercook it a bit. But I've had it, and I haven't been ill, so I guess they got that right too. Go early or go late - this place gets way crowded, way fast. Tasty Moscow Mule and a traditional whisky sour with egg white in it - yum!! Nice decor and theme, too. I really like that the cocktails have the year of their invention listed next to them on the menu - cool! Fancy cocktails always have amazed me, but usually just in the "WTF-Expensive!" way. However, there was something intriguing about all the different kinds of libations available here. I might be behind the times, but put an egg in my martini? Well, I did miss breakfast. All in all, I enjoyed all the drinks I tried (it was a big group of us) and wouldn't mind copping-a-squat some evening to try more. But I'll make sure that I've got my pretty-pennies to pay for the bill. Tavern Law's gotta have the best bartenders in town! I never got to the menu card for drinks because I so enjoyed the drinks the bartender made up for me! 
On top of that, the decor is elegant and relaxed at the same time, making me want to spend time there. Food-wise, I only tasted the fingerling potatoes (but the foie gras and the salad looked appetizing), and they were alright, but nothing to faint over, especially having just been food feasting at Quinn's a week earlier. But the foie gras and the salad looked appetizing. I'll definitely be back! My favorite place to begin and/or end an evening...Every time I have been here, I've been treated well & felt welcome. Enjoy! This isn't going to be one of those "how have I never reviewed this place yet" sort of reviews.The truth is I've held back because I still haven't eaten here, it seems my consumption of consumables here falls more along the lines of spirits. With that said, this isn't your average pub.What I love the most about this bar is their attention to detail and dedication to quality drinks. I'm usually a "double Kettel One on the rocks" kinda guy at most bars, mostly because how can you mess that up right (see my AZUL review to find out how). Anyway, here at Tavern Law I can be confident that the bartender knows his stuff. The proof is in the pudding, I have never had a bad drink here, and hands down this bar has THE best Floradora in the city (don't know what it is? try it!). The use of fresh squeezed juices, house made syrups and tinctures, coupled with brilliantly executed traditional techniques, makes for an amazing night out at the bar.As far as I am concerned, I don't even NEED to try the food here yet to write my review (thought I most likely will). The drinks, the bartenders, and the impeccable service from the super friendly staff make up a super solid 5 star establishment.5 gin-limejuice-raspberrysyrup-gingerale-limewedge stars out of 5 This place is really cool to go with one or two other people.  I like the vibe and the athmosphere.  I didn't order any food for myself, but I did try bites of my friend's food and it was really good.  
The only problem is groups.  The place is small.  I made the mistake of having my birthday party there, and there were one or two other birthday parties going on at the same time.. There were so many people crammed into a little space, it was hard to get service, everyone was standing around trying to get into the secret bar upstairs.  Just too many people!!!  I would definitely go back on a weeknight, with no more than 2 other people.  Their drinks are great, and the food is great.  Just don't have a party there.  Cool place.I thought I hated foie gras, but I tried some here and it was delicious!  Though there isn't a large menu, they certainly have some interesting choices, and unusual mixed drinks selections. We came here for drinks. They were quite good, but we not were not blown away by any means. I was a little disappointed in the use of less-than-premium alcohols (at the very least they could use smirnoff as the most basic vodka, but they take it much lower than that). I am curious to check out upstairs though. Some other day I swung by randomly at night to see what it's about. I wasn't too wow'd by it, unfortunately. Minus the deliciously moist burger (the food), the experience was okay. Even their mint julep... ehhh. I'm used to those more like mojitos but this was like just alcohol at the bottom of a numbing-freezing silver cup, a crushed ice sphere "topping" it and the straw & mint poking out.I haven't asked if they can do a Singapore Sling, yet. don't get me wrong.. this might be my 2nd favorite bar in the city.  However, I still haven't gone upstairs, it's pricey, and always crowded.  Great spot to grab drinks.. just tell em what you feel like.  Never tried dinner here tho.  Love this place.  maybe i'll edit my review after I get up there. This is my favorite classy bar on capitol hill, and pretty much in Seattle as I almost live on 12th and pike/pine. ( and will be moving to a few blocks away shortly ). 1. 
The cocktails are amazing, creative, interesting, and so many to try downstairs I almost would want to go back on a daily basis but that would likely turn me into an alcoholic. 2. The food is unpretentious but upscale- get the burger, its my favorite burger pretty much.....ever. 3. The upstairs bar is by far the best thing of everything. the fact that they make it up on the fly based on the interaction with you and trust me, every drink i have had up there was beyond superb-but whats even better is the atomosphere, the fact that they dont have blaring music, and they limit the number of people who can come up there, so it remains quiet, and enjoyable. This a place to take a date, or a hot non date. This is probably one of my favorite places to get drinks in Cap Hill. They have wide selection of specialty cocktails that are made just right - not too sweet, with all types of liquors. It is a smaller place, so can be hard to get a seat, but love the cozy feeling on a cold Seattle evening - reminds me of some of my favorite NY speakeasies! Amazing cocktails. The end. PS: The barmen at this Capitol Hill oasis are talented, personable, and know how to run a bar. They offer up a huge selection of craft cocktails, tropicals and similar paired up with a delicious gastropub menu with the likes of foie gras, pork belly and similar on the sheets. No matter when you stop in to Tavern law - be it with the professional drinkers in the early evening hours or during amateur hour at prime time on a Friday night, the service and quality of hand-crafted cocktails will be top notch. And if you prefer straight booze, the selection of spirits is deep.PPS: The best time to chill at Tavern is not, as I learned on a recent visit, at 10:30pm on a Friday. It's packed with a huge, loud crowd... most of whom don't seem to care that much about what they're drinking. No, the best time seems to be either evening or an off weeknight. 
Then you can sit at the bar and have the bartender make you something special (as if the offerings on the printed cocktail menu aren't special). But even amidst the chaos of the 10:30 crowd, our cocktologist was able to mix us up some bartender's choice drinks that were insanely well-balanced and tasty. PPPS: This is seriously one of the best cocktail bars I've been to in the PacNW. Haven't had the drinks, only the food.  Solid food and while I'm much more partial foie gras a la lobe style, I didn't think their mousse-style version (foie gras terrine) was very compelling either.  I've had the burger and chicken here (different occasions) which are solid as well.  Service is always good and consistent - like their style.Haven't had their drinks but will come back and try. Drinks took awhile, but these liquors were rhythmically put together to form a delicious masterpiece.  I still prefer the other hip speakeasy on the hill known as Knee High Stocking.Ordered some desert that took almost a half hour.  Where they freezing the ice cream?  Where they baking the chocolate jello thing?  Who knows, who cares.  One of the servers didn't even know what was on the desert.  Service was good enough, we had a laugh with the other Vietnamese waiter.One thing that did intrigue me was their door with a phone.  People would slowly go in there groups at a time.  Pick up the phone.  Mumble some crap on the phone and someone would open the door.  Apparently it's another speakeasy.  A speakeasy within a speakeasy? Ok...  If you share the same kitchen and make the same drinks in both areas, yet you as the patron, prefer to go up stairs to the 20 seated attic, then you must be looking for exclusivity.  Could care less.  Long story short: Enjoyed the drinks, but not enough to come back. Intimate bar with French 75s on special. I'm sure they can do more, but this was all I needed for a fun evening with colleagues. Gulp, gulp! Fantastic, bitters-orriented cocktails and solid gastropub fare. 
Tough to get a spot, so arrive early and stay late.*If so adventurous, tell the bartender your favorite liquor and let him whip-up some non-menue concoction up for you--more than likely you will not be disappointed. We went in around 9pm after the Capitol Hill Block party. There were about 4 people there until the place started filling up around 10pm. The décor reminds me of a law library, but it was very comfortable and clean. The space is small with a "secret" bar upstairs with limited seating. The bartender said they fit up to about 300 people on a weekend night. It looks like there's seating for about 40 people. The menu has about 10 small plate items around the $10-$15 range. The extensive drink menu had mainly cocktails with a small list of wine and beer. Happy hour prices were $7 a drink with regular prices being around the $9-$12 range. The lighting was dim with candles at every table. There were benches and chairs with leather seats. Overall, it was a very pleasant experience with very good drinks. I enjoyed this place even though as the designated driver I stuck to non-alcoholic things. The mixologists are fun to watch and affable.Washrooms are pathetic - two private ones for the whole place and one was out of order.Cramped, but nice atmosphere. First off I'd like to say that this a review for the drinks only as I didn't have the opportunity to try the food. I came here on a Saturday night after dinner and it was awesome. The bartenders definitely know what they are doing here. Just describe to them what you feel like drinking or what your mood is and BOOM! drink in yo' hand. The prices were a little steep for me, but I would say well worth it. I don't think there was one drink that I didn't like. I'll be back to try more. Tavern Law was a lot of fun for me when it opened, but since then it has gradually lost some of its luster in my book. 
Maybe the crowd has changed and that's colored my experiences, but I remember having a lot more contact and connection with the staff. Lately it seems like there is more of a focus on efficiently getting orders out. I understand this is good for turnover and keeping thirsty patrons happy, but I prefer to take my time and don't mind if the bartender does too. Although it seemed fun in the past, at this point the decor and formal costumes worn by the staff seems a little Disney, a little staged. I guess in some ways that pretense adds to the charm and mystery of the place, but I like places that keep it real and don't let a theme get in the way of the human connections in a place. My favorite cocktail bar in Seattle.  They serve very sophisticated cocktails and pay attention to every little detail.  My go-to drink is heart burn but all drinks that I tried, I loved all of them.  Make sure to go there early like before 8pm.  Otherwise it would be completely packed.  If you can, make a reservation for the secret second floor.  You will be surprised by their cute little space upstairs. The food is seriously delicious. The upstairs is seriously awesome. But the service upstairs SERIOUSLY sucks. We made a reservation for a friend's birthday upstairs. When we called on the little phone downstairs to see if we could come up they said they were running a little behind and it would be a few minutes. No big deal. 30 minutes after our reservation they let us come up, but nobody came by to give us menus or offer a drink for another 30 minutes. We had a schedule and plans for the rest of the evening, so we asked for food, which came as a big surprise to our server (?). The food was really good but came at all different times, with a side of bad attitude. They also pride themselves on their cocktails. I asked for a "vodka surprise" thinking their ever-so-skilled craft bartenders could make me something worth marveling over. I was given a cosmo. Woopdie doo. 
The rest of my friends drinks either tasted like licorice, or were heavy handed in egg white froth and bitters. Also, the drinks were super expensive.The food was great, but boy were we thrilled to leave. Umm... the fried chicken here was AMAZING... Seriously, it was probably the best I've ever had. They definitely poured a little honey over the chicken which made it super bomb. I'm going to start doing that from now on. And the mashed potatoes were so smooth and tasty. No idea how many calories were in that meal but it was so worth it.I thought the drinks were OK. I ordered the signature drink which was the first on the happy hour menu and it was pretty tiny and tasted ordinary to me. I guess I have to try the other cocktails but they're so pricey I'm not sure if I'd even want to spend that much during happy hour! Well drinks are $4 during happy hour which was decent.I'd be down to come back and try the food and check out the upstairs' Needle and Thread, other than that, I'd have to be really intoxicated and generous to myself to spend more money for fancy drinks. Honestly, I have been confusing this bar with the newly opened Tyler Florence restaurant in San Francisco. I know I'm such a dork. But this place is awesome. The small plates are delicious. We had the duck fat Mac and cheese and the oxtail gnocchi. The duck fat Mac and cheese was greater than I had expected. It was like eating morsels of duck with Mac and cheese only it's imaginary duck. That's how flavorful this dish was. It was accompanied and adorned with some greens and a thinly sliced granny apple slaw. This complimented the dish really well. The oxtail gnocchi was also pretty good. The oxtail was simmered with lovely spices to a perfect tenderness. However, the gnocchi seemed a bit overlooked in my opinion. I like my gnocchi to taste like pillows of velvet in my mouth. Unfortunately, the gnocchi was a dry and a bit crunchy even with the oxtail sauce. The drinks are off the hook. 
All of the concoctions are masterfully creative and strong. Nothing tasted like juice. The whole place is very crowded and loud. There is so much going on that one can be distracted. I mean a girl at the table next to us was literally in our conversation just because I mentioned a scene from the tv show "New Girl". It was funny because she was excited as I was about the scene I had described to my companions. I love the decor with the books and such. It made me think that tavern law was opened just for the Seattle U law students just a block away. The service is impeccable for such a busy place. This is definitely a place to return to if I'm willing to wait for a table with friends. This is a pretty cool place. Cool mixologists for sure and they give you exactly what you wanted! Only thing is, its pretty expensive. They are great drinks but I only find myself ordering one....if there is someone else with you, it gets really pricey. Just 2 drinks nearly comes out to $40. Their macaroni is AWESOME. Again, its $13 mac, and its tiny...but so delicious! It gets crowded but it comes in waves. I will be back again, just after I get paid! The food was stellar.  My party and I tried almost every major entree on the menu, except for the fried chicken (which was out).  The burger with the pork belly was amazing, though a little overcooked.  The mac & cheese was excellent and creamy.  The fried trout with risotto was delicious.  The drinks were excellent, though there's some room for improvement.  The mixed drinks the rest of my party had were better than mine (but I ordered an old-fashioned cocktail off-menu).  I'd go back here anytime. Food was excellent.  Limited Menu, but its all about the drinks. If you are in the mood for something unique and totally out of the norm, you can find it at Tavern Law! The drinks are unlike anything you will find anywhere in Seattle. its an old speakeasy that really plays the part. Go off the menu.. 
tell the bartender what you like and he will not disappoint you!!Pssstt... also ask the bartender about the secret vault door..  ;) (Needle & Thread and Tavern Law are listed separately so I reviewed them separately. I gave N & T a much lower rating.) I had read many great reviews on the drinks and food as well as the ambiance at Tavern Law so I decided to check it out and drag my husband along for the ride. I was not disappointed (at least with Tavern Law...) There is a bartender there who introduced himself as Magic who makes a lovely Ober Park Fizz. He was also very patient and answered my husband's questions on what Sazerac is and he even let us taste the perfect Rye to use and then discussed the undertones and flavors in the Rye that make it good to use in the drink. Side Note: My husband and I are in a weird transition period from late 20's to early 30's where we have children and our friends do not so they still lead the "party hard" lifestyle that we are no longer interested in. Hell, we never were. I have been trying to find new places to go without ending up either in a group of newly 21 year olds in some club scene or with people my parents age. This place totally fits that bill! There were people of all ages here but no one cared and it was really pleasant. There was one guy who had to many drinks but thankfully he left and that wasn't the norm. I visited Tavern Law for the first time a few weeks ago. I'd heard so many good things about the bartenders and drinks that I may have set my expectations a bit too high. Tavern Law exceeded those expectations on every level.It was a Sunday night, so the bar wasn't packed and everyone who worked there seemed to be in a great mood. The bartender (I wish I could remember his name!!) was absolutely wonderful and helped me find something I liked while trying something different. I felt very taken care of and I loved the atmosphere. 
The drinks are on the pricer side, but are well worth it considering the effort and quality of product that goes into crafting their cocktails.Will definitely be back! I was at Tavern Law on a Sunday evening.  The tables were empty but the bar around half full.  I had two drinks: Oaxacan something and and Old Cuban.  Both were excellent, though the pours were a little small.Tavern Law is pretty darn good.  The one shortcoming was, though the bartender mixed a good drink, he didn't add a lot to the experience.  In my opinion, a top-notch bartender provides also provides conversation and brings the crowd together.  The guys at Bathtub Gin are fantastic at that; I've ended up with three new friends every time I've gone there.  I won't hold it too much against Tavern Law because it was a slow night, but that is the difference between four and five stars. Loved this place. The perfect bar; great cocktails, cool atmosphere, awesome late-night food menu! We went around 11pm on a weeknight and got a salad, mac n' cheese, and truffle fries. The salad and mac n' cheese were bomb. I would be a regular here if I lived Seattle! The crowd seemed cool too; like hipsters who look like young lawyers (but are cooler than that) :) I've never had a cocktail here that was less than amazing.That being said, many of the times I've tried to come here, it's been too crowded. Even the few times I did manage to get a seat, I've gotten crashed into by buzzed and bewigged barhoppers trying to get into Needle & Thread (the "secret" bar upstairs). The only time this didn't happen was when I sat outside.If you come here, prepare to either: stand, have a seat in a far corner, or have a lot of people's butts rub against you awkwardly. You'll still get a great cocktail. I adore Tavern Law, but last night's experience was too much of a mixed bag to leave as is. First, the mixed drinks there are outstanding. My friend had a Sidecar and said it was one of the finest she'd ever had. 
My date ordered his favorite drink, a Sazerac, and it was stunning. My cocktail was a delight. And we all had the burger. We all felt it was maybe the best burger we've had in a long, long time. The conversation went something like this: "What is the sauce on this burger? It's amazing. It's like the extract of angels. It's like magical unicorn blood!"I hope the proprietors read this as there are two issues I think need fine tuning.1. The french fries are too salty. I like salt. I love salt. For it to be TOO salty means it's way too salty. Leave the oversalting to those at the table, and leave it out of the kitchen.2. The music playing was horrible. We all felt like we were trapped in a small car on a very long roadtrip with our parents. They played Boston, Journey, Foreigner. For a classy joint outfitted with beautiful Depression era antiques and upholstered walls, I would have preferred something era-appropriate. A little swing music? Some jazz? Big band? Even a quiet hum from the fridges in the back would have been preferred. We'll absolutely go back and we'll order the burger again, but we're going to need extra Sidecars to wash down the fries if they are served that salty again. We went for the wife's birthday. We're new to Seattle and didn't really know that Seattle has a speakeasy fetish. We're from Chicago where we just sit in prohibition era bars and drink.  That said, the decor wasn't offensive, the place looks nice enough with lots of wood and books that make me feel comfortable. We really wanted to sit at the bar but the few patrons there were all spread out and could have been condensed to let a few others enjoy the bar seating. Instead of asking people to move, we're all about practicing Seattle civility and sat at a table.  We found a seat and were brought some water by guy with attitude. Ok. sure no problem. Guy is a dick. big deal. We're here to have some drinks and have some fun. Forget it and relax. 
We start to look at the extensive menu and waiter boy comes scooting over and hovers over my shoulder. He insists that we stop looking at the menu and tell him what we like. We really just sat down and could use a minute to chill and catch our breath (we rode our bikes there and were still recovering from the hills- chicago flatlanders). But he insists it's more fun without the menu. So, I indulge. I give him a few kew words and then he starts naming drinks from the menu that I was supposed to close. It was like what? Sorry guy, don't have hyper photographic memory. We shoed him away and he came back for our order in a few minutes. Drinks were small, expensive and complicated. My drinks really lacked a smoothness or a completeness. A little tasty and a little medicinal but never really "there." Maybe it was the sunday bartender? The tiki drink was really good, maybe that's why the charge even more for tiki? The food was tiny. Like expensive yuppie tapas.Overall, wouldn't really go back here unless it was to watch the place burn down. Our waiter was ok. He has mastered the art of being a condescending prick while expecting a big tip. He told us not to take the bike path home because it was dark and dangerous. So we took the bike path home and it was the best part of tavern law. Two friends and I went to grab a drink and while we got to sit where we wanted (outside), it took forever for the host to notice us. He didn't say a word to us when he put the menus on the table. I was looking forward to my drink because the extensive drink menu made me believe Tavern Law does cocktails well, but mine was just ok (Autumn's Folly). I've had much more well-crafted cocktails at a better price. The service got a bit better towards the end of our visit, but I still almost didn't leave a tip.I do admit we enjoyed people-watching the yuppie crowd. For some reason it's refreshing to be around yuppies when you're sick of the Capitol Hill hipster/hippie crowd. 
While they may be known for their detailed and intense cocktails I found mine to be only so-so. Because the bartenders have to focus so much on each of the intricate drinks they make, they aren't very attentive to the large number of people waiting at the bar as well. Maybe if we had spent more time there and had a few more drinks I would have enjoyed it more. Or maybe if we had made reservations upstairs. Oh well. The decor was very nice though! Amazing drinks freshly prepared friendly bar staff. Best cocktails I have probably ever had overall.Cool inside but tightish for seating, fills up fast.Happy hour all day Sunday. This place is so reminiscent of a bar in the late 1930's-early '40s during the prohibition era. The bartenders are suited up and there's an actual speakeasy to get into an elevator (the door looks like a bank vault) for the "exclusive" floor above. Faux (it looked like) book cases that went from floor to ceiling.There's an extensive bar menu. I decided to order the drink special, some tropical something, that had gin, lime, peach, & bitters. My drink was sooo different than what I was expecting. It was massively spicy! The heat would take effect after a few seconds. The description said nothing about that. I was expecting a bitter/sour taste but not the burning sensation. I would suggest that the staff try their drink specials so they know 1st hand what they taste like. I asked the waiter if it was supposed to taste they way it did, and all he could say was, "I think I may have picked up the wrong drink."So, not only were the drinks expensive, but they got mine wrong. The waiter apologized, re-took my order for another drink, but didn't make any complimentary adjustments to the bill. I will give Tavern Law another chance in respect to their ambience and decor alone, but hopefully I can re-evaulate and increase my stars rating. Service excellent.  Burger/fries, one of the best I've had in Seattle.  
Mac n cheese was so/so, drinks were good, but not blast my socks off.  Cool space.  Super hip, including secret upstairs lounge.   I'll be back. Fantastic food, inventive & delicious, inexpensive...Superb gastro-pub food out of one of the smallest kitchens ever. For your first foray into sous-vide cooking you can't go wrong here and you won't drop a ton of money...The cocktails list is extensive and delicious although you can spend some serious dough on drinks--especially if you are upstairs in the Private Bar. The only missteps I have encountered on 4 meals here is occassionally the seasoning was a bit off (a little salty)...but I mentioned it to the cook Ross, and the next time the Fried Chicken was spot-on. Fun spot. I did not like dining upstairs however...trying to have to balance food for 3 Diners/Foodies that love to eat when our table was an awkward ottoman was not good for the back...We ordered tons of food-- almost one of everything off the menu and it all came at once...I would have thought the waiter would have paced it out better. And when asked for another tray to balance dishes on--the waiter brought a tray used for bussing...When asked about which of 2 unfamiliar beers the waiter would recommend-- he said he didn't know cause he doesn't drink beer! (This from one of GQ's Top 25 Bars) Really? I think the (upstairs) waiter needs to study up on his beers. Still, love this place! But in the future I will be sitting downstairs.... This is a cool little bar with a old fashioned vibe.  The quality of the cocktails was the best I had while in Seattle (with a similar vibe as my favorite bar in San Jose).  The real trick is to come early so you can use the little phone and get a place upstairs in the secret speakeasy called Needle and Thread.  We didn't have that opportunity, but maybe next time! My husband hosted my birthday party in their upstairs cool/sneaky speak easy bar. Four stars for:1. Unmatched ambiance2. Quality and taste of cocktails3. 
Great Bar-tending staff4. Hidden location and private space5. Yummy food6. Availability on a Friday night on one week notice!Minus one star for:1. The event manager who helped my husband schedule the event. She was not very helpful in answering his queries over the phone or email. We wanted to order some food off the catering menu, she kept telling us to show up 30-40 minutes prior to the event and order food for the party of 35 people. When we got there, the server told us we were supposed to order food off the menu days prior  to the event. This was very frustrating and we ended up ordering appetizers off the regular menu that were served in small plates. So the food set up at the event almost looked like it was leftover food sitting on the tables and not meant for our guests. In all fairness the mac and cheese and potato wedges were yummy. Just the set up and coordination with the manager sucked.  Again, the onsite servers were very helpful.Regardless of the unhelpful interaction with the manager, it was a great and memorable birthday party! Amazing drink knowledge! I've only been here twice and I love it! As many of have said, the atmosphere is great, food is delicious and drinks are amazing. The drinks are pretty creative and they usually change up the cocktails and have their regulars. Both times I ordered the burger with pork belly and it is so good! The combination of caramelized onion, pork belly, and burger is amazing. The bun on the burger is just as good and I can just eat it by itself. The bread is soft, buttery and flavorful. Service: the first time I went, our server was great! He was attentive and quick especially when it was quite busy on a weeknight. Unfortunately, I cannot say the same thing about my second visit. I went early on a Tuesday night and there were hardly any patrons. The server took 25 minutes to take our food order after serving our drinks. I had to guzzle done my wine so he could see that my glass was empty to get his attention! 
If we just wanted to drink, my friend and I would've sat at the bar. When he were done with our meal, he finally cleared our table and my friend asked for the bill. We waited 15 min...and no bill, so my friend actually had to get up and ask the server for the bill!  The service we experienced was unacceptable especially when the restaurant was empty. Instead of paying attention and checking to see if we needed anything, he was busy chatting it up with his coworkers.Even though I experienced bad service, I will still come back because the place is great, food is absolutely amazing and drinks are awesome! I just hope I don't get the crappy server! I hope no one else will have a bad service experience. Executive Summary:Snag some grapefruit gin. Very good stiff drink. Ambiance is what you'd expect from a mega-hyped bar like Tav law. If you are looking for a more chillaxed scene with even better crafted mixes, scope out Sun Liquors near Summit and Mercer. Pros: Go here if you are stumbling by. Cons: Don't go out of your way to make it. After visiting Tavern Law a handful of times now I am quite confident in saying it's my favorite bar. I am not much of a drinker but I always enjoy the quality of the cocktails here. Sure, the drinks are pricey but they are worth it! I also find the service to be a breath of fresh air. No matter how busy it gets I generally feel like my business is still valued and made a priority. The bartenders are as friendly as they are talented and I have been pleasantly surprised by their ability to remember my likes and dislikes. First time here this past Saturday for a Ladies Night.  Service was great, ambiance was classy but not stuffy.  Drink menu is quite large, diverse and exciting, although the drinks weren't as strong as one would expect from the ingredients.  Crowd was mixed as well.  I was most intrigued by the not-so-secret vault door leading to the "secret" speakeasy, the Thread and Needle.  
We tried to get a last minute reservation but were turned down.  I actually admire they were sticking to their occupancy limit!  This place takes the art of drinking seriously.We eventually talked our way into the secret entrance for just a quick look about, and it was really neat.  You enter a narrow wooden staircase adorned with 20's pin up girl photos.  The loft-like area has great vintage fixtures, fabric walls, and is just cozy enough without being uncomfortable.  I also liked that it wasn't filled with a bunch of chachi club kids.  The people here were sophisticated and really into the 1920's throwback vibe.  A group was even dressed in pinstriped suits for the guys, flapper dresses with feather headdresses for the ladies.  Very cool.Had some interesting sounding food options on the main floor, but not very veg friendly.  They can do better.  Overall, the Tavern Law is a hip but non-snobby night out.  Recommended! Awesome cocktails that can be described in one word: Vintage.Nice selection of fancy bar style grubs as well. First - this is the current "we don't serve ketchup" place. There's always at least one on Capitol Hill at any given time; ridiculous. The line they give you, when you ask about ketchup, is usually delivered with a healthy dose of snobbery, and an offering of some type of alternative condiment to be served in a tiny tiny dish as a reminder of your silliness for asking. This is how snooty places think they distinguish themselves, and they are right.Had the burger with pork belly based on a recommendation. Acceptable, but by far not the best burger for the steep price. Heartburn later - urgh. Service was just OK, too. Lots of employees walking around, talking, but not so much table service as you'd like (water refills please! etc.) and no friendly banter with our waitress even when we tried hard to engage her.So many other options on the Hill, this one isn't my style at all. 
I'm way into friendly service and down-to-earth attitudes about food and customers, and this place is neither. Oh yum!  Much like Patterson House in Nashville, which I think I also reviewed, this bar has an unusual drink menu and a lively yet relaxing ambiance.  I wish I took pictures of the drink menu.  I can't remember the name of what I had - something with a raw egg and completely delicious!  For me, this would be a fine night out on its own.  In my partying days, this would be an excellent beginning to a fine night out.Denver restaurateurs - fancy, complicated drinks, low-key atmosphere!  Make one here! The concept and the cocktails are cool, making the trip worthwhile.  The food can be a little haphazard in terms of whether it arrives or not, or if it's what you ordered, but when it did arrive, was delicious.  We had a pork back appetizer, probably no longer on the menu, to die for.   We did not go upstairs. Blood and Thunder with Foie Gras.x2Tavern Law gets a lot of hype, and I admit I kind of avoided it for that reason, and because it sounded a little pretentious, what with the gigantic cocktail menu and the "speakeasy" upstairs. However, I really liked the ambiance and the drinks, which were absolutely as good as their publicity promises, and the foie gras was excellent. Watching the bartenders at work is entertaining and educational, too, fascinating really, and I like to come here to sit and the bar and watch them like a cooking show. Tavern is a great place to impress friends or to spend an hour gawking at the bartender side show, but whatever your reasons, if you like a craft cocktail and fine food, come here. Look, I really can't fault this place.But maybe I should complain about the high prices of their petits plats. Maybe I should bitch and moan about the pretention and other noise that you can experience here. 
Or perhaps I should point out the limited patience a fickle public has for this whole speakeasy zeitgeist.Well, to hell with that.The cocktails, especially in the speakeasy upper niche, are ...A-fuckin'-mazing.And that's all I have to say, because, frankly, what else really matters? Been here a lot and loved it. But the service has got really uneven. Here on a Tuesday night and waited 30 minutes for a vegetable plate. Same thing happened to table next to me. Restaurant wasn't busy. Wait staff just forgot. been upstairs and downstairs. always packed when i visited. upstairs is cool, though not a whole lotta food selections. i like the way that you order drinks from upstairs... you tell them what your preferences are, what elements you want, and they just make it. no menus. though i didn't particularly love my drink...... so i'm not sure if that was a win afterall. cool place though. also - this place is not cheap.
## 214 So I must admit based on all the reviews and what other people have told me I came in here with some pretty high expectations and I must say I have never been soo underwhelmed in my life.  The service was curt and indifferent, the facility was an upgraded hole in the wall, and the cheap prices are the ONLY thing that's keeping me from giving this ONE star.  That's right ONE star.Seriously, for those people who say the food is good.... you all need to get out more.  The pho broth basically lacked any apparent flavor.  The noodles were plentiful, but if I'm eating noodles with a tasteless broth it really doesn't matter how much noodles you give me because it'll still be tasteless.  And yes I may be an asian food snob, but when it comes to ethnic cuisine you really need to abide by one simple rule if the people of that ethnicity are the ones there its probably pretty good.  This place was full of people of shall we say a more european decent and my friend and I were the only asians there.  Nuff said.So if you're looking for a good bowl of pho definitely check out I Love Pho (they have three locations) instead, its a little more expensive, but the broth is much better.  That said, if you prefer bland broth and tasteless noodles I have some styrofoam you can come over and munch on as an appetizer.PS THEY ONLY TAKE CASHPSS can someone tell me why all the bench cushions are ripped and somehow they find it okay to not fix them???? best pho in the area Not impressed. Place seemed dirty. Staff were down right rude to our group, and kept forgetting some of our orders. Tables were filthy. Broth was bland. We won't be back. Granted, Than Bros. was my first pho experience, and their pastry puffs are just awesome!  They're not a bad choice by any means, but I always feel like I'm being forced to order and eat at a drive through window.  I wouldn't recommend coming here if you want to eat in a relaxing atmosphere or spend time having a conversation with your friends.  
A little bit dirty, table cloths and walls could be cleaned more often.  But, need to eat fast and don't want drive through, they're the next best thing. Wow, this place is a dump inside. Be sure to hit the can before you go because you DO NOT want to find yourself in the nastiest potty this side of Hanoi. That said, the Pho is great. Probably the best in Seattle. And the prices are dang cheap. When it's raining, the pho calls to me. Noooodlles. Brooooth. Weeee're waaaarm.I'll admit - it's a ugly restaurant. Last time we went, my booth seat was ripped with its spongy innards gaping out at me. But it wasn't dirty, mind you. I've never been to a Than Brothers that was dirty.What I like most about the Than Brothers (Ballard and Capitol Hill) is the predictability. The small chicken pho is always exactly the same, and that's the way I like it, thank you very much. Bite-size pieces of chicken, a perfect knot of noodles, happy little green onions floating in a not-too-salty chicken broth. Add in some bean sprouts and some Siraracha, and I'm good to go. The cream puff is a bonus.All in all, it's good (and cheap!) enough for me on a cold and rainy night when the idea of making my own soup is unappealing. I just moved back to Seattle from Boston...one stop I had to make was Than Bros in Ballard.  I had a hard time finding good Pho in Boston- but I can find it here in Ballard, that rich mecca of asian heritage- er, wait...When I go to Than, I'm not looking for ambience- I'm looking for a piping hot pot of beef, noodles, plum sauce, basil, sprouts, chili sauce and lime...and that's precisely what I get.  I don't mind that the waitstaff never refills my water, or the place has a wierd bready smell.  No, I'm looking for something to slurp and to top off my meal  with a cream puff, and Than does it for me. Delish~Free cream puff!  And the key is to go with a friend who doesn't like cream puffs so you get 2 for free!  
And if you're a real piggy you can buy 3 for a whopping $1.25.Don't be scammed by their "size chart".  My friend Erin and I decided that no matter what size bowl you order, they bring you the same amount.  Just smile and point to the one you want and eat your free cream puff!  Cash only too, no check buster I have been to this location several times; some take-out, some dine-in.  I vote for take-away at this place.We went there the other day for lunch.  My pho got to our table in about a minute and a half.  No complaints about service; it's quiet and quick.  The food is good for the price.  Great on a rainy day.However, they could use some new booth cushions.  Both of our seats were worn out.  The foam in the cushion wasn't just visible... enough of it was exposed that it was disturbing.  We're not talking just cracked vinyl here.  Had I been sitting in the middle of the cushion, my butt would have been riding the dirty foam.  That said: if you're looking for quick, cheap, filling food around Ballard - this is your place.  Just get it to go, or choose a table with chairs. Good enough. I ordered a medium, so I'd have food for tomorrow, too. The broth is bland without half a cup of hoisin sauce, and a liberal dose of sriracha (aka Hot Cock) as well. $6.36 for two meals can't really be beat. I considered an all-pho diet, in order to save myself a bunch of money on groceries. It seems like I go to the grocery store, spend $75, and have nothing to eat. I know I'll eat the pho. Making decisions about my food based only on price doesn't sound prudent though, and I quickly did away with the APD (all pho diet) craze I was about to launch. One thing I love about pho is that you can eat until you can barely move, pee a couple times in the next hour, and be back to comforable waistband territory with little effort. Which leaves one to wonder, is it really food? Sure it is. 
just don't count on it to be more than it really is, which is a bowl of very hot water with some noodles in it, maybe some meat or tofu, and not much else.It will make you feel better if you're heartbroken, stressed out, sick or otherwise miserable. Than Brothers is the bar by which Seattle Pho is measured, in my book.  This is their newest location, in addition to Greenlake, Cap Hill, and the University District.Sure, the ambiance isn't so great.  But you don't come to a Pho place expecting that - how do you think they can keep the prices so low?Their signature cream puffs are brought to your table before you eat, and I usually eat them as an appetizer.  You can always get more to go, 3 for $1.25.  Get a whole tray on your way to a party, everyone will love you!I've only ordered the chicken and veggie pho, but I've always had good quality chicken.  Try a iced vietnamese coffee too.  This place is a must when you are getting over a cold.  Just don't sneeze on Buddha. The best Pho I've ever tried! Any type that you order, don't forget to get some extra meatballs! The broth tastes so good that I always eat/drink it all. For a cheap price, you'll eat until you almost burst! They have four sizes: small, medium, large, and extra large. The extra large size is so big that I'm always reminded of a basin whenever I see it, but it only costs about $6-$7. Food comes out really fast. So if you're really hungry, Pho Than Bro is the best place to go to. They serve you big portion of food in about 2-3 minutes after you place your order. Moreover, they give you a really tasty cream puff as an appetizer for free! If you want more, though, you gotta buy 'em. Pho is perfect for cold weather, but even during summer, they have air-con inside so you'll be able to enjoy your Pho in any season.Downside: The place is not so clean. Sometimes the tables are sticky. And they only accept cash.Some warnings: 1)Pho is famous for its distinct, onion-like smell. 
I don't know if people actually like the smell, but I don't, however big my love for the food is. So, I'd reccommend that you go there wearing your not-so-favorite shirt, because you'll have to wash them right away. I wouldn't want to go to important meetings or parties right after I ate at a Pho restaurant, because I think people will think that I have a really bad body odor.2) I usually take the bus. If you like the broth like I do, don't attempt to run to catch a bus after you eat. Your stomach will be all jiggly with broth. I am a big guy and I can eat a lot but I can just barely finish a small bowl here.  They are HUGE.  The Pho is good particularly when you add the rooster sauce and brown goo.  Ummmmm brown goo.  The price is great downright cheep actually.  What more you ask....free cream puffs, oh my.  There is no atmosphere to this place seems like a stale lunch room cafeteria.  And there isn't really any service.  They take your order bring it in no time and that is it.  If you go in expecting more, then you will be disappointed.  Enjoy the Pho and creampuffs cause that is all there is. i'm pretty surprised at how low some of the ratings are, especially those that seem to factor in atmosphere/ambiance or the service as a major flaw.  it makes me think that seattlites have no idea what "dive" really means because, while the ballard location isn't "charming", it's a freaking pho house;  it's clean enough & all the chairs & tables match.  the server not chatting you up?  maybe it's because english is his/her 2nd language & hell, they've got noodle soup to make & serve.  personally, i love how fast our order is taken & our food is served (& they've never gotten our order wrong). i also think the soup is pretty good, although i will admit to getting the same thing everytime.  lots of noodles, lots of meat & i garnish the hell out of it.  to me, that's what pho's about. Edible. Adequate.Not stellar. Probably the least best of the chain.Hate all the mirrors. 
Service not great.Broth not favorful, however, veggies were fresh.Good creampuffs. Despite the last two positives, this place does not live up to the mediocrity of its siblings. This place is cheap, tasty and has great service. I have found that some hole in the wall places have sketchy Pho ingredients.... Wondering it the meat is old smelling it over and over before leaving hungry. But Than Brothers is always fresh and clean!
## 219 This place is great. $5.99 entrees during happy hour, 4-9.30, cash only, so that you're able to escape having only spent a total of $8 and carrying a full takeout box because their portions are enormous. The service has always been fast and courteous, although they are a bit slow at times about water refills. Last time, we grabbed a pitcher from the bar (a big group of us was seated in the back bar) and they seemed to get the hint.The food is consistently good. The pad thai is delicious, as is the tom yum noodle soup. Spice level is hearty, so consider ordering a star below your normal pick. I've never had a dish here I didn't like. The food was ok, acceptable but not great. Service was mediocre, and the staff i feel is kind of incompetent. I had the phad thai and it wasn't spectacular. It was a bit on the salty side, the color of it is way different than most other thai places, I think they use soy sauce, where as most thai places don't.On top of that, they have a "cash only" happy hour, this is the part that ticked me off, because they post outside "happy hour from x-pm to x-pm" but when you go in and sit down, it says CASH ONLY for happy hour prices. Bait and switch! I wished I knew about this place when I was still knee-deep at UW. I found out about this place a few months before graduations. 4-8pm HH and all day Sunday I think? $5.99 for Thai food? SOLD! It's pretty solid Thai food, service is nice albiet disorganized sometimes, very cramped and not good for large groups. Cash only for HH prices... two noodle dishes came out to $13.18 pre-tip. YUM!I'd come back if I happen to be in the area, but sadly I'm not. One of my favorite affordable Thai joints in town! My favorite dish here is the Pad Kee Mao. I'm pretty picky when it comes to the right squishiness of noodles but they've got it down pat.  
As mentioned by countless others, be sure to come here for their happy-hour 6$ specials for (almost) all entrees with cash, which seems to be pretty much active all the time. It doesn't hurt that they've got a nice sense of decor and a chill music mix to give the right atmosphere. This is my go-to place on the Ave for consistently delicious Thai food. Unlike many of the others, I have yet to hear complaints about heath code violations or general gross-outs while dining. This place was a little-known gem when it first opened 2 years ago but it has now been discovered due to the high quality food and large portions for such a reasonable price. They have even extended their $5.99 entree happy hour (cash only) to 4-9:30pm I think. The only downside is that with their well-deserved success, the rushes have become crazy and the small staff sometimes has a difficult time keeping up with the demand. Unless I'm meeting friends for food I tend to order out during happy hour. The pad thai and the pad see ew are equally delicious but I tend to favor the pad see ew with fried tofu. There is something about the proportion of noodle to veggies to tofu to egg to spice that can't be beat! Also- if you're absolutely starving or can't decide on just one thing the combo plate is the way to go- 2 entrees of slightly smaller proportions for $9. Satisfying and plenty of leftovers for that midnight snack or tomorrow's lunch. Good thai food. But  very awkward with the whole pay with cash for happy hour, only because they ran out of change. He couldn't break a 20 so i was scrounging to pay 6.59 (total for pad thai with tax, not bad!). I wasnt able to tip because I had no more money.  Next time I'll bring my card, although it is a bummer to have to pay extra for each entree when using a card.Good food. I almost gave 3 stars but I felt bad because the service was good and so was the food. Maybe because it was 6pm on Sunday night. 
I've passed by this place many times but haven't dined until very recently.  The menu has a decent selection of curries, noodles, etc. for good prices.  Happy hour rates are cash only.  The interior is bright and cheery and wait staff sufficiently nice.I went in for an early dinner of Pad see ew, which was tasty.  My only complaint is that the food arrived at an unnecessarily long interval between my friend's dish and mine.  It wasn't busy, so I hope this isn't a bigger problem during peak hours. "Great food takes time"4 stars. And I'm not talking about my spice level. The food at Banana Leaf is delicious. The pad thai comes near perfect all the time ( they forgot a lime one time, but no biggie). I can see that the cooks at Banana pay attention to detail for every dish that goes out. Massive props for a place that's consistently busy.Your patience will be rewarded if you decide to dine here. Tips:-pay with cash- avoid the lunch and dinner time rush-prepare to wait at least 15 minutes or more (yes, I timed it) I have been there once for dinner. It was very crowded and I had to wait for about 15 minutes for the food. The staff was friendly but the service was only okay. I usually order to go here and the waitress is very helpful. Most of the time the food I order is ready when I arrive in about 15mins. My favorite dish here is the cashew chicken and the Vietnamese vermicelli with bbq pork.  I would say the portion is small for the price you pay for the Vietnamese bbq pork dish, but the taste makes up for it. The vegetables are usually fresh which is a bigggg plus. The thai food is decent compared to other thai restuarants on the ave. Many times the pad thai taste a little dry to me.
## 252 - Locale Destination - It's on 45 Th St and Latona Ave NE in NE Seattle. (Wallingford)The restaurant was a house. Unless you see the big sign at the front, you wouldn't notice as a restaurant. Decor is very chic - white, woods, splash of pink - like a cafe. Very cute. Bathroom is upstairs. No parking, so park on the streets by the restaurant. Nice and friendly service.- Gluttony Expedition - Spicy Ocean: Tons of seafood (mussels, salmon, squids, shrimps) and vegetables (bamboo shoots, bell peppers, mushrooms, green beans, Thai basil etc.). Seafood was cooked very well - very tender. The sauce was spicy and sweet. Good. I tried #3 spiciness (out of 5), which was maximum for me and very good/spicy. For $15, the amount of seafood and vegetables were amazing. Good dish. - Conqueror's Deduction - This was unexpectedly good. It's a bit strange to see "gyoza" (Japanese pot stickers) at Thai restaurant (they call themselves "Contemporary Asian fusion cuisine"), yet my dish wasn't so Americanized and got a good Thai flavors. For the quality and quantity, this was very reasonable. Great find ;) Yum Yum YumYum Yum YumYum Yum...errr...all the way?Sorry - I just, really loved the food I had delivered from this place on Friday night.  The appetizer platter came with fried veggie rolls, crab rangoon, fried tofu and gyoza, all of which were awesome and fried and awesome and not healthy!  The pad see ew and panang curry were both fantabulous as well - I had the pad see ew with shrimp and there were at least 15-20 good size shrimp in it!  Way generous compared to the norm and plum and tender - plus the size of the dish as a whole was ridiculous - easily would feed three.  The panang curry had tons of chicken and peppers and onion in it and was deliciously spicy and also made for three good sized plates.  
All in all I'm super bummed that they only deliver to my buddy's house and not all the way to mine, but I'm glad I had the opportunity to try it and will probably get take out from them in the future. My roommate and I order from Djan's at least once a week (through Eat24) and we love almost everything we've tried, especially the crab rangoons and fusion noodles. Super delicious. We really like that their spice levels are consistent and the 4 and 5 levels are HOT. More hot than a lot of other Thai places.Also: NO styrofoam delivery containers. Very important to me. Had the Penang and it was top notch. The samurai phad Thai was also groovy. Ambience was casual but upscale. If you are looking for some good Thai look no further Djan's Dining is the best Thai food I've had in Seattle so far.  My husband and I have been to the restaurant once but mostly order our food. The place is a cute, minimalist, white space (which I liked) and the service was on point..  They have a particularly delicious egg plant dish (I don't remember the name), and tasty Pad Thai and garlic chicken mmm.  In certain days the delivery took a bit longer than expected, but these orders were during peak hour times on rainy days, any other time the delivery was on time (this time delay is why I gave 4 stars vs. 5 stars in rating). I have only ordered from Djan's once and it was take away, but based on this order I am reluctant to order again. I ordered Pad Thai so that I could compare to Sea Thai, where i have ordered in the past. Boy, what a difference. Maybe tonight was an off night for Djan's, maybe they have a new cook or something. I will probably give them another try, but this order was seriously sub par. No spice, no condiments, no peanuts, no scallions, very few bean sprouts, no flavor, just a pile of noodles and some over-cooked meat. Way too little care for $13. Sea Thai is way ahead of this.I want to point out that the food was on time and the staff was friendly and polite. 
Still, I had to get into the kitchen and finish this Pad Thai myself. I love this place, great food and they deliver to me which is even better. The garlic beef or tofu if awesome as is the djan fried rice and it comes with curry chicken pieces on top so it's a meal. Don't  order the pad se ew for delivery, it gets mushy, not sure how it is in house. My biggest problem with this place is the inconsistency on the spice stars. I've order 4 stars and 1 star on the same dish and it was exactly the same. Other times 1 start was 1 star. There's no way of knowing what you'll get at each visit. This is a major problem when sharing with a child or if your personal tastes are for the mild. If you like spicey then you'll have no problem. The restaurant itself is an old bugalow house on N 45th in Wallingford. Would be good for a quiet dinner but not great for kids as you can hear everyone else and kids would be heard for sure. Overall great food and service. Worth a try. Cute Thai restaurant with a nice clean, warm atmosphere. My friend and I ordered from here last night (delivery). Usually, I get Thai food from a place down the street, but we needed the food delivered and had limited options - good thing. I'm glad to have finally tried Djan's, after driving by it for years. We got the Pad Khee Mao and the Pad See Eiw (both of our favorite noodle dishes)? We agreed that they were the best we'd ever tried in Seattle. The veggies were flavorful and cooked just right and the tofu was excellent. We were impressed that the noodles were not all glued together, like often happens with delivery food. I thought the prices were reasonable for the Wallingford and the delivery was so fast for a Friday might. They said 45-60 minutes, but they arrived in 20! The portions were also generous. I will definitely be choosing Djan's from now on. When you need some Pad Thai and you need to eat it in your jammies within about a half hour... look no further! 
I can only offer delivery rated opinions, but I asure you my palate is as fine tuned at home as it is in person. The spring rolls are fresh, the (fried) tofu is crisp, and the pad thai is delightfully flavorful. Definitely a go-to in Thai delivery! Definitely the cleanest and freshest tasting thai food around the area.  I order delivery from here every couple weeks or so and the food always comes on time and hot.  Only complaint is that soemtiems I'll ask for 4 or 5 start spiciness and it's still not strong enough for me.
## dirty
## 541 0
## 542 0
## 543 0
## 544 0
## 545 0
## 546 0
## review
## 541 So while at my "job transition workshop" awhile ago I found myself in a somewhat unfamiliar area and unsure where to go for lunch. My classmate wanted teriyaki, so off to Midori we went. The shop is a little dingy, but I wasn't really expecting anything better. The prices seemed a tad high considering you can get a fancy sandwich or entree anywhere else for about the same. Anyways, I ordered beef with broccoli. I think she ordered chicken teriyaki and a salad. My meal was OK. Not great, but I ate most of it so I can't say I hated it. That being said, I can't imagine what circumstances would bring me here again. As an aside, I swear the sign outside says "Midori Teriyaki #4." Are there three other ones? If so they haven't been Yelped...
## 542 Everything on the menu is $6.95, even combos! Not only that, but Harried & Hungry is ridiculously fast and there is plenty of seating. It's generally not crowded, unlike most of the other places in the area. My only complaint is that the combos seem a bit limited. I wish it was ANY half sandwich with a cup of soup or salad, but for the half sandwich combos, you're stuck with a choice of mozzarella caprese, turkey swiss or chicken pesto. In the midst of enjoying my first H&H lunch combo, and feel the need to write a review. Basically, check this place out. For $6.95, you can get a combo for lunch (which is incredibly filling by the way). I didn't eat breakfast today, and can barely make it through my "small" salad and half-sandwich.The house salad is really bomb, w/ beets, mixed greens, goat cheese, and walnuts. There was just enough dressing, in my opinion. The chicken pesto sandwich on the other hand, is good, but kind of dry. Word on the street is that the turkey club is better...Also, if you order online you can skip the crowd to pick it up (although it wasn't very crowded at 11:55 a.m. when I walked in), and they give you points every time you order that save you money on your lunch. I'll definitely be back! Really solid lunch place. *Really* solid lunch place. Based on strength of reviews, I had to give this a try. Plus, it's west of my office, an area I feel I'm ignoring during my stay in Seattle. The menu says items are pushing $8. and that would still be a deal, given the food. But a sticker on the menu stand says "Ignore these prices! $6.95!" Which makes this a stellar deal. Even after I bought a fancy drink it wasn't $10. My lunch was a chicken pesto sandwich, which had nice pesto (although could have used more) but importantly had warm chicken, good tomatoes and spring greens, and was on a warm roll. Just having it be warm (and not the whole sandwich, so if they microwave, they at least strategically microwave) makes a huge difference. Very good. 
But the real winner was the Harried House salad. Spring greens, sliced beets, sliced pears, candied walnuts, and goat cheese, with a good light vinaigrette. A better salad than I would ever expect in a $6.95 combo, on quality of ingredients alone. The beets were good - and I don't even like beets. The pears were the perfect texture, both sweet and with actual pear flavor. the goat cheese mild and creamy, and the candied walnuts...well, they were really more like toffee, they were so candied. But I'm not complaining - this whole salad was awesome. Only two complaints: the salad was in what was kind of like a deep cereal bowl. Hard to eat. But this is hardly a complaint worth mentioning. More substantive - your sandwich choices are limited in the combos. Will I be able to try a different sandwich knowing I can't get the salad too? They don't have half salads, so I just might be stuck in a rut after the first visit. I love this place; it's an under-appreciated find!Sandwiches, soups, and salads, all modestly priced and ready for either sit-down or take away.  I'm a fan of the combo lunch, which gets you 2 out of 3 choices (soup, sandwich, or salad) for about $7.  The food is really good, the service is fast, and the staff is friendly.  And it's rarely busy!  One of my favorite spots to sit down for a meal or just pop in to grab lunch on my way back to my horrifying office job. It's 7:43 a.m., and I have to leave for the bus stop at 7:46 a.m.I forgot to make my lunch.  Crap.This is where Harried & Hungry comes in.  I work downtown, so the lunch prices are ridiculous (ten bucks for a salad? Please.) and blah-blah-economy-blah-blah.  Honestly, I just think it's silly to pay a ton of money for something as simple as bread, meat, cheese and a couple of veggies.Every single lunch entree at Harried & Hungry: $6.95.  Ordering online?  
Get 30 cents off and, oftentimes, coupons in your email to take off another 10%.The sandwiches and salads are good, too: I've had the Harried House salad (which made me realize I like beets) and the Turkey and Swiss sandwich.  Both were filling, delicious and made with quality ingredients.  I'm not a fan of pickles, but if you are, you'll be pleased to note that each sandwich is cut in half and topped with two baby dills.  (Aww.)When I don't bring lunch from home, this is easily the healthiest, cheapest alternative.  (My non-healthy alternative: MOD Super Fast Pizza.) We happened to be walking down the street, looking for some food, when we saw Harried & Hungry - "That describes us exactly," I exclaimed to my friends, and we decided to give it a try. And it was so good. I had an enormous complicated salad, with bacon and hard-boiled eggs and chicken, plus a peach smoothie that tasted like peaches. Seattle is a little far away from Portland, but if I have the opportunity I'm pretty sure I'll stop by again. Stopped in for a quick lunch. Fast, courteous service. They gave us free cookies that were quite big and fresh! I got an egg salad sandwich and a house salad, both of which were good. The egg salad sandwich had lettuce and tomato on it, which was kinda odd but tasty enough; the salad was better. Reasonably priced, vegetarian friendly. Would go there again! Harried & Hungry is a solid four stars.  I have been here many times.  My very first sandwich downtown was from H&H when our office moved down here.  Better than average sandwiches, great soup, (I love their rendition of beef/barley) and great salads.  Eat here or take it with you.  They use those goofy flashy/blinky beeper things to tell you when your order is ready - which don't add to the experience, but are handy.
## 543 Let's just say, Spiro's is working on more stars....sort of like cub scouts and badges.I've enjoyed:The stuffed calzone with pepperoni and sausage.Spaghetti with marinara and green salad. (vinaigrette dressing)Tortellini with white sauce and green salad. (ranch dressing)I've had a better calzone from Pizza Hut.  The bread was bland and thick.  The sauce was slightly spicy but I like the stuffing to be the main part of the calzone, not the breaded pocket.  I will not order that again.The Spaghetti with marinara was good, more or less mediocre.  I liked the salad but found it to be standard.  Good crunchy chunks of lettuce and olives sprinkled heavily atop made me smile.White sauce Tortellini is the only thing that stands out in my mind.  It did taste a bit salty but it was creamy and good.  Definitely not good for me but good for the taste buds.The dressings on each salad were nothing special, probably bought wholesale at Costco or something.Like everyone else, I felt more like it was Greek than Italian.  Eh.  Good waiter, male, cute, damn witty and crazy brows!  He sort of made it a better experience.  The atmosphere is clean and neat.  Definitely a kid friendly and family oriented environment.I'll give their pizza a try but the menu is pretty limited.  I'm also going to try the Lasagna, Ravioli, and a few of their sandwiches....although I'm prepared to be disappointed by the bread on those subs! Wonderful Greek style pizza - golden brown crust, with a nice crust of cheese on top. In my top 10 pizza places for sure!The atmosphere leaves a little to be desired for me - I prefer something a little grittier, or even trendier - the inside of Spiro's is kinda boring, which does take a little something from my pizza experience. Still, it's damn good pizza!
## 544 Hands down, the best dim sum in Seattle. We've done the rounds, Sun Ya (delicious, but has slipped in the last few years), Jade Garden (fuck your lines!), and have decided that Harbor City is our new favorite.We ended up here on Sunday after a failed visit to House of Hong (blech) and were treated to hot, fresh, flavorful dim sum classics. Clearly they're doing something right, as we were one of only three tables of white folk (my litmus test of authenticity). Harbor City, FTW! My friends and I were really excited to try some dimsum in Seattle.Thanks to the great reviews on yelp we found Harbor City! They're open from 8:30 - 10 PM from Mondays to Thursday and on Fridays to Sundays they're open till midnight! Can't really beat a place that opens late to get your fix!We went here on a Monday and parking around the area was not a problem and the restaurant wasn't that busy! Service was fast, and they were really friendly and patient considering it was our first time there! We ordered a few items on their menu, but when it didn't come out, we reminded our server to cancel it and she did it with no questions ask! The food was a hit! We were very satisfied with most of the food and it was pretty cheap! - 1 because one of orders we really wanted never made it on our table.- 1 because they didn't have a couple of items we asked for. Still a solid 3 STARS!!! I've only had "proper" (read: a cart comes around and you pick foods off of it!) dim sum once, when I was young, but the memory stuck with me and ever since then I've thought about dim sum at least once a week. There's just something about a traveling buffet, a wandering visual, tactile menu, that appeals to me.So when I was wandering around the ID with my friends and we got hungry, it seemed like a great time and place to try to find some dim sum. 
Harbor City was well-reviewed on Yelp, so we went there.The place was nearly empty on an early weekend evening, but the few tables that were full, were full of Chinese families, which I took to be a good sign. I've barely ever had "real" (versus "American") Chinese food, but I've an adventurous and open-minded palate, and I'd rather try something strange and authentic than always stick with what I know.(I'm also mostly vegetarian - and unfortunately Chinese food isn't super veg-friendly - but I do make exceptions when I want to try new things.)Harbor City didn't have dim sum as I remembered it. There were no carts, no steamy buns full of red bean curd and pork. There was a menu with (unappealing) pictures; we ordered off that.The food was mostly good. Nothing astounding, but I enjoyed the dumplings in their tiny, steamy baskets, and the pea vines were delicious. The service was prompt and respectful, if not very warm. I'm not sure that I'd go back - it didn't blow my socks off, but neither did it disappoint. Except that there were no carts. No carts! My search for carts continues. Honest opinion, not as good as Jade Garden. Simply because the food here is moderately just "OK". If you're here to just stick with the generic dim sum items, go for it. If your here to try something new, variety is not your friend. Great alternative if you want to beat Jade Garden's wait time.Dishes ordered: Dim sum Ok, I have been here over 5 times now and I can say without hesitation that this is not good dim sum. There is a delicate nature to good dim sum that this place lacks. For example, the shrimp dumpling filling is overly fishy yet lacking in flavor, the wrapper is an unusual texture and they are thrown into the steamer in such a way that when they get to your table the wrapper is already halfway free of the filling and they have all stuck together. Same for all their seafood dumplings. The chicken feet are just ok and the pea vines are overly greasy. 
Overall I don't understand why people wait for this place and I will not be doing so anymore. The places on the Eastside are not great, but they are better than this place. If you have to eat dim sum like I do, regardless of wether it's the best you have ever had, then it's worth the extra toll and gas to go to Joi's on the Eastside. It's not SF standard, but it's better than this place. Yeah I am a fan. I think this place is better than any we have down in Portland (correct me if I'm wrong because I am not very familiar with the SE district). And cheap.  Foodwise, I loved the sticky rice, green beans, and everything else. The fried taro, in my view, had too fine crust.BTW, watch out for a barking guy at the free for-customers-only parking lot at the corner across the street from Harbor City.  We parked there to look up a restaurant and the guy literally barked at us (the kid and me) when we walked away from the car.  I don't know what he was saying (I don't speak Chinese) but it took me for a while that I could communicate to him that another person was in the car and was about to move the car out. We got there at 11am on a Saturday, we only had a 10 min wait. The dim sum is excellent, the dim sum is bigger than at other places and the prices are so great. I like the fact that they have smaller portion plates of non dim sum items like the Salt & Pepper Calmari and the Honey Walnut Prawns (both of which were excellent). The place was packed and there were about 20 people waiting by the time we left at noon. Probably the best dim sum in the Puget Sound area.
## 545 This is such an odd place in a little residential area.  The bubble tea is good, though far from the best I've ever had, very sweet and highly flavored.  The crepes are good too, with generous portions and the prices are good too.  The interior is very comfy with a cute and warm feel, I would definitely come back. Great service! Mango chicken crepe was big, filling and fantastic. The croissant breakfast sandwich was smaller but still very tasty. Can't wait to come back for the nutty banana crepe. If you're nearby, this is definitely worth stopping in. Weird but comfy environment.  Best Taro bubble tea i have had.  Employees were quick and polite.  Only downside is that it is semi difficult to get to. Coffee is decent.  Service was good.  Not sure what all the hype is about this place.  Could it be the bubble tea?  I will try that next and then update. Not only the best boba, but also outstanding service.  The employees stayed half an hour after closing time on New Year's Eve so that I could pick up my wallet which I'd left there.Also, try their crepes- they're great!Thanks to Pearls I've only been here twice, but always MEAN to go by more often. Maybe 2012 will be the year for it. :-)The bubble tea is great, as are the banh mi sandwiches, soups, and other snacks. The owners are sweet as pie and SO helpful, as well, they seem to really want to take good care of their customers (a rarity these days IMO). If you're in the Delridge neighborhood and want a nice place to stop for coffee/tea and a warm snack, highly recommend! There's a cutie behind the bar with dimples, dorky glasses and a baggy sweatshirt. I came in and surprised her with a rose....she smiled..and I wanted to pinch her cheeks. I ordered the mango chicken crepe....portion is not bad at all it was just right. 
Presentation and taste were great....the epitome of Vietnamese fusion with french influences such as the pickled dicons.....the cafe is welcoming and beautifully decorated with just enough colorful lighting for the correct ambiance. A bit out of the way for me but it was worth the time and trip ;) I don't know about this place...nothing special to me.  Costco muffins and cookies, bags of doritos and gummy bears...am I missing something?  I didn't try the crepes but I hope they're homemade.  My vanilla latte had no vanilla, not that I could taste anyway and the coffee wasn't very strong.  The people behind the counter were very nice though so that's a plus.  I don't know...nothing to write home about, I've definitely had better. I've been looking for tea/coffee shops like this since I've moved here from Boston. I love the atmosphere and they have very good selection of coffee/food (for a coffee shop). I'm not sure if it's owned by a couple but they were the sweetest people. The food prices are also very reasonable. I ordered a bbq baguette for around $5 and it was pretty big. There is free wii there and tables with plugs so for anyone who wants to drink/eat good coffee/food while working, that is the place to go. Wow. I don't know how anyone could go to Starbucks with a place like this around! Everything we ordered was just fantastic. My green apple bubble tea was delicious, as was my husband's white chocolate mocha, which was the best one either of us has ever had. We split the meatloaf baguette, which as my husband's choice and the smoked salmon crepe, which was mine. I would not have chosen the meatloaf but boy, was I glad he did. It was soooo good, with lean, melt-in-your-mouth meat. The smoked salmon crepe was equally amazing. We both loved the extra kick given by the daikons. We took our meal to go but the space is adorable and very welcoming, and so is the staff! Loved it!! I don't have any complaints, but didn't wow me enough to give 5 stars.  
I never ate food here, and the bubble tea is good.When we lived in West Seattle, this was our go-to bubble tea place, because the other places in the area are thumbs down.  I quite like the decorating in here, maybe wouldn't quite jive with interior decorator tips, but TONS of personality!  Which is what I love!  I always fantasized about plunking down and studying there for a while, but we always just made a b-line there and zoomed away again. Cool eclectic interior like a really coze college lounge.  At first I am thinking they only serve coffees and bubble drinks, but they have food, too!  Excellent Food!  I had a Hot Mama crepe with spicy links.  Delish!  The pearl drinks were great, too.  Lots of flavor options. I love this place. Best coffee and bubble tea in west seattle! They cook the tapioca just perfect. The people that work there are so lovely and nice. The restaurant has a  great vibe too and the set up is absolutely fantastic! They have wi-fi so you can grab a cup of coffee and maybe even try their delectable crepes or baguette sandwiches and eat while working on your laptop. This place is fantastic. The ambiance is delightfully rustic (ie: old and eclectic, yet comfortable.) The service is great, always with a smile. Best of all it's one of the few local businesses in the part of the Delridge corridor.The bubble tea here is great. The tapioca pearls are always fresh and tasty. Somehow the bubble tea I get here tastes better than even bubble tea from the international district.If you're hoping for food, the selection is limited but doable. Crepes are the best choice and seem a bit expensive, but occasionally they hit the spot! I love Pearls!I work about 3 blocks away, and it's always my go to when I'm hungry for lunch. They have so many options for such a small shop and their bubble tea doesn't make me sick to my stomach like Oasis does. Outside, Pearls looks really small, and it is really small. 
But when you go inside, the place is so cleverly decorated, every inch of space carefully utilize to maximize its use! Comfy couches, cute little tables, they even have space for a mini fountain in the middle of the place. For sure, give this place a visit. Their crepes are soooo good!Plus they walk out to bring you your order while you're sitting. How sweet! Love this place. So, barely tried a TASTE of bubble tea the other day somewhere else and was instantly hooked. I had driven by this Pearls a million times in the past few months and knew they had tea, so thought I'd grab one on the way home. So glad I did! It's a cute little place, with some parking on the street and 4 spaces on the left side of the building. I ordered a blended Taro with tapioca, a huge cup and only $4! It was sooooo good! Such an earthy, sweet taste...I am really hooked now. I saw they have crepes and other food items and I'm sure they are good too. I'll try them sometime in the near future :) Good job, Pearls! It deserves to be busy and packed, but I'm kind of glad that it's not. Great service, super casual, decent prices. The cookies are so-so (kind of doughy), but the coffee is top-notch and the bubble tea is great. The layout is really cool, so you can watch TV (they usually have sports playing), or just hang out in your own little corner. It's all very well-loved furniture, so you're not worried about dribbling on yourself or anything like that, and it's super-comfortable. Feels like when you'd hang out at your poor friend's house with the really nice mom who happens to be an awesome cook.
## 546 I really enjoyed my recent small group dinner with some fellow yelpers here last week.The nachos certainly lived up to the good things I'd heard about them, as did the margaritas. The hold back, mainly, for me, was the lighting. I get it; ambiance, mood setting, all that. Really, I do. But I am "night blind". I really cannot see well in the dark. And certain light levels over others are the *worst*. When I walked in looking for the group (who I knew was there, the host had *just* stolen my parking spot not 5 minutes earlier) I couldn't see a bloody thing farther than a few feet in front of me. Navigating that in a full bar while intoxicated would be the death of me, and anyone in my falling radius.Overall, though, it was a good evening getting to know some new (to me) yelpers and seeing some veteran favorites again. I'd do an event on the "upper deck" level of the place, and I'll try and trek it out to WS again to sample a bit more of the menu. As often happens on my West Seattle excursions, the scene is as follows: there is a beer festival of some sort or other at Beveridge; I have partaken; hunger and a need for sustenance ensues.This time, I wanted to check out Mission, which has been on or slightly under my radar for years. We arrived about halfway through the daily happy hour, and despite having sampled two trays of barleywine between the three of us, we hadn't imbibed to a margarita-preventing level...so promptly ordered a couple. Plus two plates of three tacos (shredded chicken), a couple of orders of chips, pico and guacamole, and some sweet pappas (roasted sweet potatoes with a bit of crema, some crumbled cotija and green onion atop) to share.The dishes were your standard American-Mexican fare, meaning that if you're a haunter of taco trucks whenever possible, these won't be your thing. So they weren't mine, but they were fine (especially for the price!). 
The shredded chicken was mild, the guacamole a bit dry, and the sweet potatoes a world away from the sort of patatas bravas that stir my Latin-loving heart. Nevertheless, the platters arrived promptly and with a smile, and the margaritas were impressive: heavy hint of decent tequila, without the cloying, artificial sweetness so prevalent in bars 'round the world. It's the rare spot that makes me want to return to taste tequila, but Mission might be one given their expertise with a lowly happy hour drink.I can't say anything about the rest of the menu since I only tasted the aforementioned happy hour offerings, and I know that many bars and restaurants excel elsewhere. But for what I refer to as Mexpensive, I might be more inclined to head to Barrio (for inventive/Americanized) or El Mestizo (for vibrant/traditional) in my neighborhood if I'm craving some Latin flavor. Nevertheless, I look forward to a return to Mission: the interior is pretty sexy with the low-lit booths, prayer candles (they work, I swear!), hanging paper lights and the lovely brick-backed bar. And then there's that little matter of the many pages of tequilas.... The Mission is basically a clone of a clone; not to get all Old Testament on you, but Peso's begat (complete with nasty law suit) Matador and Matador begat Mission.  It's the new style of Mexican restaurant now days and a step away from the cheesier (pun intended) incarnations of Tex-Mex like Azteca, Toreros and Las Margaritas.  The newer places are better decorated, don't have the combo plates and don't dump copious amounts of shredded iceberg lettuce and cheddar cheese over everything.  a major improvement to be sure, but already these new places are beginning to feel pretty formula, too.I have no real complaints about Mission.  I really dig the space and appreciate that it's more bar than restaurant.  
The food was pretty decent, but I have to tell you, the beef tacos across the street at Admiral Pub are better than anything I had on the menu at Mission and that's not meant to be a slight on Mission, it's just that somehow Admiral Pub is pretty out some pretty awesome tacos.  The bartender was great and overall this is a decent place. I like the mission.  Love the plantain chips, although the portion is a bit small for the price.   I think the food is flavorful and interesting, especially the pork dishes.The margaritas are great - very well mixed and tasty - not too sweet or bitter.I don't really enjoy how dark the place is.  Makes me feel like I'm spending  more than usual for dinner in a basement. If I'm having my birthday there tonight, April 28, at 8pm, that means it's a pretty cool spot. (pssst, this also means you're invited)It's a lounge-y-type of place, so if you're looking for crazy dancing and loud music, Mission isn't typically the place for you. NOW, there are definitely exceptions, Cinco de Mayo, which is coming up in a few days. It was ballistic last year. A band in the corner, people crammed in there. It was amazing. Awesome food, cool people that can actually converse without saying "WHAT???!!" all the time, and great drinks all around. Oh, by the way, the atmosphere and decor is unlike any other bar in the city. You can tell owner Peter really scored with the details. While I enjoyed the Lizzard Lounge from back in the day, Mission came in and made everyone forget it ever existed. Met up for sister's birthday celebration. Ambiance was great, food was very good and creative. There also appeared to be a private party going on upstairs which seemed perfect for an after work event.Music was modern, didn't really match the motif. We were also looking for a fruity drink of some sort and they really didn't have anything like that. Overall pretty good. :) Best margaritas ever.  Order the Bartender's & you'll be happy!!  Order 2 and you'll be in love.  
;) I like the mission and the fact that it has many veggie options.   I went there this evening with family and friends and had a fine meal.   Here is a rundown Happy Hour(4 star)Twice daily, 4 to 6 and 10-midnight.   Nice priced appetizers, drinks were a $1 off for drinks on the  Happy Hour menu.   Be careful if you are there close to 6 and order as if they get your order in past 6 you pay full price.  Nachos were really tasty as well as the house margaritas.Yum!Dinner(3 star):   I had the portabella enchilada's which were good, but heavy taste of cinnamon.   Portion was ok, but priced a bit high for my liking. Atmosphere(4 star):   Swanky, dark, nice candle light, rustic wood mirrors-I like the feel of the restaurant !Wait Staff (4 star)   Very good customer service and she checked back with us often.Price (2 star)----Veggie options should have been priced closer to $10 instead of $13.00 Next Visit---I think I will take advantage of happy hour and eat dinner at home for the price Mission is great for happy hour!  The three soft tacos are my favorite hh menu item, and they are fine with mixing and matching (unlike some other Mexican restaurants in Seattle that forbid this).  The mix and match is great because the sweet pappas taco is AMAZING...but as a meat eater, I wouldn't want three of them.  The beer selection isn't anything to get excited about, but the tequila selection is fabulous.I have only sat at the bar, and the server has always been fast and attentive.  I would say, lacking in personality and friendliness - but certainly not at a detriment to our experience.  We missed happy hour by a few minutes and realized that Mission is strict on their happy hour times.  It was shocking how much more expensive the same 3 tacos from the happy hour menu were on the regular menu.  
We skipped the pricey food and ended up sharing the chicken nachos - which were pretty tasty...but I like my nachos with the cheese at least slightly melted - the platter we had was just sprinkled on cold.  The ample chicken and black beans made up for the difference in me and Mission's nacho cheese temperature preferences.I disagree with reviews comparing Mission to Matador, though I can see why they made the comparison.   I feel like Mission has a much less commercial feel than the Matador and lacks the snooty vibe I get in there sometimes.  I will be back for happy hour again and again! I have never eaten anything but HH food here.  I'm sure they have a full menu, but I don't think I've ever even looked at it.  That being said, if you're looking for a place to grab HH, definitely try this place out.  The nachos are always great and plentiful.  The servers are always super attentive and don't seem to mind groups, even those where people meander in and out at different times. This is the second time I went to Mission, but this time I actually got food. The food was tasty and the portion sizes were what Americans SHOULD be eating, so for most that would be a bit on the small side. I was there for my sister-in-law's private birthday shindig and we got our own bartender, but service was quick and pleasant. Dish prices were about $10-15 for a normal dish and about $7-10 for appetizers. Drinks were pretty standard as well, but a bit on the strong side. Overall a great experience and it's always great to have places like this in West Seattle. I can walk to this bar in west Seattle but it's worth a drive as well. Fantastic food. Very dark with bar and table seating. Two levels. A turntable. Friendly bar staff. Perfect for finishing off an evening... And probably starting one as well! 
Great drinks, excellent food, and music as well Very nice atmosphere and music seems to vary in volume (from ok to too loud.)Drinks a bit expensive.Food good.Nothing special but then again that means (in Seattle) pretty good as the competition is so ferocious and so many places so good. My 30th birthday had crept up on me & needed a stellar place to have a party. Mission rents out the mezzanine for a great price. I met with Peter (awesome to work with!) and we planned out my special evening. It was an 80's themed party and all the guest were dressed to kill! The Bartender was incredible and the food, like always, was outstanding. I have loved every visit to Mission, but loved this one the most! They made it a great 30th birthday bash! Thank you for a great time guys! You are wonderful & I can't wait to come back soon and have some delicious Mexican food! Happy hour at Mission is one of the best in West Seattle.  For $5 you get three soft tacos which usually fill me up.  The ambiance is really nice too, and you don't have to be 21 to eat there.  The bartenders margarita is a must and the waitresses are always very pleasant. Sweet Potato Enchilada!....The waitress got annoyed after she had to move us to another table "no fault of ours" she was snappy and short the rest of the night which sort of ruined it but I had no expectations, so, hey. Great place for a first date! Food is good and happy hour specials are yummy! Mission is one of my favorite late-night happy hour haunts.Reviews of Mexican-style restaurants always seem to devolve into whether the cuisine is 'authentic' or not. I will say upfront that I generally could care less and only judge based on taste and experience. Weigh accordingly...They are probably too expensive during dinner hours -- expect to spend at least 20 bucks a person on dinner here once a drink is included. 
The food is tasty and usually spot on in terms of serving size.They have a very good beer selection (not just mexican imports but NW rotating) and also a good (but slightly pricy) tequila selection. They have  most of my favorite tequilas...The atmosphere is great -- low lighting, candles, and quirkiness. Anyone who's been to the Matador will be very at home in this more down to earth rendition of that theme...Happy hour this place comes into its own with good food and drinks and a kitchen open late. Highly recommend! Really liked this place, service was good and so was the drink.  Chips and salsa were tasty as well!  Didn't have any food off the menu but my friend enjoyed their calamari.  I look forward to returning! This was my second experience at Mission.  My husband and I tried the restaurant over a year ago and it wasn't memorable, for better or worse.  I suggested we try it again tonight, and I think it will be our last time there.  We mostly had a poor experience with service.  Our server seemed very uninterested in our needs and we actually got more help from the host, who may have also been the manager than we did from the bored server.  For the first time ever, my husband and I agreed that we needed to leave a small tip.  We are good tippers by practice, and tip 20% as a rule, sometimes better for great service.  Its a shame that we didn't feel  that great about being there, because some of the food had potential.  My fish tacos were really pretty good, and we had some interesting peach and cheese stuffed jalapenos to start.   The hubby rated his dinner as a 6 out of 10, and my friend was disappointed in hers.  They charge $3 for chips and salsa, which was a waste.  They were cold and tasted stale.  Mission: fail. Nice place, nice atmosphere, pretty tasty margaritas.  Still can't get over being charged for chips and salsa, then when we wanted a bit more salsa with our meal we were charged another $1.50.  
Just seems crazy to me and resulted in me taking away a star. My husband and I came here on the recommendation from friends. I liked the ambiance right away. I ordered the Flirtini drink which was strong but lovely. I also enjoyed my sweet potato enchilada. Husband liked his food but didnt' love it. Our dessert was only okay, but presented in a fun way.  We'll definitely come back another time. I love the Matador, but this place tries (not successfully IMO) to be it's exact twin. I've tried the food here three times now, and ive found it to be pretty bland and unimpressive. The drinks are fine, and I love the decor. I just think the food is pretty meh, since I ordered something with habanero sauce and could barely taste it. Maybe my taste buds got burned off along the way? Great happy hour - good selection of drinks and food. Service was a little slow, otherwise an excellent happy hour for tonight! Food's very tasty, though not authentic. Upscale. For people who like some spice just ask for my fave "el Yucateco" and add heat to taste. Like it here . Nice tequila flight.
## 'data.frame': 546 obs. of 2 variables:
## $ dirty : int 1 1 1 1 1 1 1 1 1 1 ...
## $ review: chr "I'm only giving this place 5 stars for the severe nostalgia it invoked when I walked in the door.  Growing up as a vegetar"| __truncated__ "\"ewww.....bleh...\"  sums up my bao experience at Yak's.  I made a detour over to Fremont after Yelping \"Bao\" "| __truncated__ "This is my fallback dim sum place when the wait at Jade is too long.  Sun Ya has much better congee than Jade. it's rare w"| __truncated__ "Sun Ya = zzzzzzzzzzzThere is absolutely no variety here. I think we got some hum bao, shrimp dumplings, peking duck and mango p"| __truncated__ ...
# Sanity checks on the freshly imported data frames.
# The training response must be fully observed.
num_nas <- sum(is.na(glb_trnobs_df[, glb_rsp_var_raw]))
if (num_nas > 0)
    stop("glb_trnobs_df$", glb_rsp_var_raw, " contains NAs for ", num_nas, " obs")

# Equal row counts suggest that a subset was not actually split off
if (nrow(glb_trnobs_df) == nrow(glb_allobs_df))
    warning("glb_trnobs_df same as glb_allobs_df")
if (nrow(glb_newobs_df) == nrow(glb_allobs_df))
    warning("glb_newobs_df same as glb_allobs_df")

# Drop the configured columns from all three data frames.
# drop=FALSE keeps the result a data frame even if a single column survives.
if (length(glb_drop_vars) > 0) {
    warning("dropping vars: ", paste0(glb_drop_vars, collapse=", "))
    glb_allobs_df <- glb_allobs_df[, setdiff(names(glb_allobs_df), glb_drop_vars),
                                   drop=FALSE]
    glb_trnobs_df <- glb_trnobs_df[, setdiff(names(glb_trnobs_df), glb_drop_vars),
                                   drop=FALSE]
    glb_newobs_df <- glb_newobs_df[, setdiff(names(glb_newobs_df), glb_drop_vars),
                                   drop=FALSE]
}
#stop(here"); sav_allobs_df <- glb_allobs_df # glb_allobs_df <- sav_allobs_df
# Check for duplicates in glb_id_var
# No identifier column configured: use each data frame's row names instead
if (length(glb_id_var) == 0) {
    warning("using .rownames as identifiers for observations")
    glb_id_var <- ".rownames"
    glb_allobs_df[[".rownames"]] <- rownames(glb_allobs_df)
    glb_trnobs_df[[".rownames"]] <- rownames(glb_trnobs_df)
    glb_newobs_df[[".rownames"]] <- rownames(glb_newobs_df)
}
## Warning: using .rownames as identifiers for observations
# Identifiers must be unique across all observations
if (anyDuplicated(glb_allobs_df[, glb_id_var, drop=FALSE]) > 0)
    stop(glb_id_var, " duplicated in glb_allobs_df")

# The identifier and the .src marker are bookkeeping columns, never features
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
                                      c(glb_id_var, ".src"))

# Combine trnobs & newobs into glb_allobs_df for easier manipulation;
# .src records which data set each row came from
glb_trnobs_df$.src <- "Train"
glb_newobs_df$.src <- "Test"
glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
comment(glb_allobs_df) <- "glb_allobs_df"
glb_allobs_df <- orderBy(reformulate(glb_id_var), glb_allobs_df)

# The separate frames are not needed until they are re-derived later
glb_trnobs_df <- NULL
glb_newobs_df <- NULL

glb_chunks_df <- myadd_chunk(glb_chunks_df, "inspect.data", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 1 import.data 1 0 11.169 13.768 2.599
## 2 inspect.data 2 0 13.768 NA NA
2.0: inspect data
#print(str(glb_allobs_df))
#View(glb_allobs_df)
# Print the class distribution of `var` within each data source (.src):
# first the counts, then each row's share of its row total.
#   var: name of the column in glb_allobs_df to cross-tabulate against .src
dsp_class_dstrb <- function(var) {
    src_xtab_df <- mycreate_xtab_df(glb_allobs_df, c(".src", var))
    rownames(src_xtab_df) <- src_xtab_df$.src
    counts_df <- subset(src_xtab_df, select=-.src)

    print(counts_df)
    # NA cells are ignored when computing the row totals
    print(counts_df / rowSums(counts_df, na.rm=TRUE))
}
# Performed repeatedly in other chunks
# Data checks that are re-run from several chunks:
#   1. histogram of the raw response, one facet per data source (.src)
#   2. for classification, the class distribution of the response
#   3. report of problem values via mycheck_problem_data()
glb_chk_data <- function() {
    rsp_hist <- myplot_histogram(glb_allobs_df, glb_rsp_var_raw) +
        facet_wrap(~ .src)
    print(rsp_hist)

    if (glb_is_classification) {
        # Use the mapped response once it exists, else fall back to the raw one
        dstrb_var <- ifelse(glb_rsp_var %in% names(glb_allobs_df),
                            glb_rsp_var, glb_rsp_var_raw)
        dsp_class_dstrb(var=dstrb_var)
    }

    mycheck_problem_data(glb_allobs_df)
}
glb_chk_data()
## Loading required package: reshape2
## dirty.0 dirty.1 dirty.NA
## Test NA NA 200
## Train 273 273 NA
## dirty.0 dirty.1 dirty.NA
## Test NA NA 1
## Train 0.5 0.5 NA
## [1] "numeric data missing in glb_allobs_df: "
## dirty
## 200
## [1] "numeric data w/ 0s in glb_allobs_df: "
## dirty
## 273
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## review .rownames
## 0 0
# Create new features that help diagnostics
# Derive the modelling response from the raw response when a mapping function
# is configured, then verify the mapping and show the resulting distribution
if (!is.null(glb_map_rsp_raw_to_var)) {
    glb_allobs_df[[glb_rsp_var]] <-
        glb_map_rsp_raw_to_var(glb_allobs_df[[glb_rsp_var_raw]])
    mycheck_map_results(mapd_df=glb_allobs_df,
                        from_col_name=glb_rsp_var_raw,
                        to_col_name=glb_rsp_var)
    if (glb_is_classification) {
        dsp_class_dstrb(glb_rsp_var)
    }
}
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## dirty dirty.fctr .n
## 1 0 N 273
## 2 1 Y 273
## 3 NA <NA> 200
## Warning in loop_apply(n, do.ply): Removed 1 rows containing missing values
## (position_stack).
## dirty.fctr.N dirty.fctr.Y dirty.fctr.NA
## Test NA NA 200
## Train 273 273 NA
## dirty.fctr.N dirty.fctr.Y dirty.fctr.NA
## Test NA NA 1
## Train 0.5 0.5 NA
# Convert dates to numbers
# typically, dates come in as chars;
# so this must be done before converting chars to factors
# Derive calendar and time-of-day features from date columns stored as strings.
#
# Args:
#   df:      data frame containing the columns named in `vars`.
#   vars:    character vector of date column names; each one is parsed with
#            strptime() using the globals glb_date_fmts[[var]] and
#            glb_date_tzs[[var]].
#   id_vars: identifier column name(s); only used to print the offending rows
#            when a date fails to parse.
#   rsp_var: response column name; only used in that same diagnostic print.
#
# Returns:
#   A data frame with the row names of `df` holding, for EVERY var in `vars`:
#   <var>.POSIX, <var>.year.fctr, <var>.month.fctr, <var>.date.fctr,
#   <var>.wkday.fctr, <var>.wkend, <var>.hour.fctr, <var>.minute.fctr,
#   <var>.second.fctr and, when the minute-of-day takes more than one value,
#   <var>.day.minutes.poly.1 .. <var>.day.minutes.poly.k (k <= 5).
#   A zero-column data frame is returned when `vars` is empty.
#
# Stops (after printing the offending rows) if any date cannot be parsed.
myextract_dates_df <- function(df, vars, id_vars, rsp_var) {
    if (length(vars) == 0)
        return(df[, character(0), drop=FALSE])

    # Get US Federal Holidays. The page does not depend on the date column, so
    # it is downloaded once rather than once per var.
    # NOTE(review): both the page and the pasted year are fixed to 2012, so
    #   dates in other years never match a holiday - confirm this is intended.
    library(XML)
    doc.html <- htmlTreeParse(
        'http://about.usps.com/news/events-calendar/2012-federal-holidays.htm',
        useInternalNodes=TRUE)
    # parse the tree by tables
    txt <- unlist(strsplit(xpathSApply(doc.html, "//*/table", xmlValue), "\n"))
    # keep the "<weekday>, <Month> <day>" entries and strip the weekday
    txt <- grep("day, ", txt, value=TRUE)
    txt <- trimws(gsub("(.*?)day, (.*)", "\\2", txt))
    # Holidays as "YYYY-MM-DD" strings so the lookup below compares like with like
    hldays <- format(strptime(paste0(txt, ", 2012"), "%B %e, %Y"), "%Y-%m-%d")

    # Constant columns stay numeric; cut() is only applied when values vary
    cut_if_varying <- function(vals, breaks)
        if (length(unique(vals)) <= 1) vals else cut(vals, breaks)

    feats_lst <- vector("list", length(vars))
    for (var_ix in seq_along(vars)) {
        var <- vars[var_ix]

        # POSIXct, which is what data.frame() stores a POSIXlt column as
        .date <- as.POSIXct(strptime(df[, var], glb_date_fmts[[var]],
                                     tz=glb_date_tzs[[var]]))

        # NA dates can arise from wall-clock times that do not exist, e.g.
        # "4/7/02 2:00" during the EST->EDT transition.
        # Show the offending rows BEFORE aborting.
        if (any(is.na(.date))) {
            print(df[is.na(.date), c(id_vars, rsp_var, var)])
            stop("NA POSIX dates for ", var)
        }

        num_part <- function(fmt) as.numeric(format(.date, fmt))

        wkday  <- num_part("%w")              # Sun=0; Mon=1; ...; Sat=6
        hlday  <- as.numeric(format(.date, "%Y-%m-%d") %in% hldays)
        hour   <- num_part("%H")
        minute <- num_part("%M")
        second <- num_part("%S")
        day_minutes <- 60 * hour + minute     # minute of the day

        feats_df <- df[, character(0), drop=FALSE]    # keeps df's row names
        feats_df[[paste0(var, ".POSIX")]]       <- .date
        feats_df[[paste0(var, ".year.fctr")]]   <- as.factor(format(.date, "%Y"))
        feats_df[[paste0(var, ".month.fctr")]]  <- as.factor(format(.date, "%m"))
        feats_df[[paste0(var, ".date.fctr")]]   <-
            cut(num_part("%d"), 5)                    # by month week
        feats_df[[paste0(var, ".wkday.fctr")]]  <- as.factor(format(.date, "%w"))
        # weekend also covers holidays
        feats_df[[paste0(var, ".wkend")]]       <-
            as.numeric((wkday %in% c(0, 6)) | hlday)
        feats_df[[paste0(var, ".hour.fctr")]]   <-
            cut_if_varying(hour, 3)                   # by work-shift
        feats_df[[paste0(var, ".minute.fctr")]] <-
            cut_if_varying(minute, 4)                 # by quarter-hours
        feats_df[[paste0(var, ".second.fctr")]] <-
            cut_if_varying(second, 4)                 # by quarter-minutes

        # Orthogonal polynomials of the minute of the day.
        # poly() requires degree < number of unique points.
        unq_vals_n <- length(unique(day_minutes))
        if (unq_vals_n > 1) {
            max_degree <- min(unq_vals_n - 1, 5)
            poly_mtrx <- as.matrix(poly(day_minutes, max_degree))
            for (degree in seq_len(max_degree))
                feats_df[[paste0(var, ".day.minutes.poly.", degree)]] <-
                    poly_mtrx[, degree]
        }

        feats_lst[[var_ix]] <- feats_df
    }

    #myprint_df(feats_df)
    return(do.call(cbind, feats_lst))
}
# For every configured date variable: add the extracted date features, plot the
# response over time, and add log-gap features measuring the time elapsed since
# the 1st / 10th / 100th previous observation (in timestamp order).
if (!is.null(glb_date_vars)) {
    library(zoo)

    glb_allobs_df <- cbind(glb_allobs_df,
        myextract_dates_df(df=glb_allobs_df, vars=glb_date_vars,
                           id_vars=glb_id_var, rsp_var=glb_rsp_var))
    # Exclude each raw date column AND each <var>.POSIX column; building the
    # two sets separately avoids recycling the suffixes across several vars
    glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
        c(glb_date_vars, paste0(glb_date_vars, ".POSIX")))

    for (feat in glb_date_vars) {
        glb_allobs_df <- orderBy(reformulate(paste0(feat, ".POSIX")), glb_allobs_df)
#         print(myplot_scatter(glb_allobs_df, xcol_name=paste0(feat, ".POSIX"),
#                              ycol_name=glb_rsp_var, colorcol_name=glb_rsp_var))
        print(myplot_scatter(glb_allobs_df[glb_allobs_df[, paste0(feat, ".POSIX")] >=
                                           strptime("2012-12-01", "%Y-%m-%d"), ],
                             xcol_name=paste0(feat, ".POSIX"),
                             ycol_name=glb_rsp_var, colorcol_name=paste0(feat, ".wkend")))

        # Create features that measure the gap between previous timestamp in the data
        z <- zoo(as.numeric(as.POSIXlt(glb_allobs_df[, paste0(feat, ".POSIX")])))
        glb_allobs_df[, paste0(feat, ".zoo")] <- z
        print(head(glb_allobs_df[, c(glb_id_var, feat, paste0(feat, ".zoo"))]))
        print(myplot_scatter(glb_allobs_df[glb_allobs_df[, paste0(feat, ".POSIX")] >
                                            strptime("2012-10-01", "%Y-%m-%d"), ],
                             xcol_name=paste0(feat, ".zoo"), ycol_name=glb_rsp_var,
                             colorcol_name=glb_rsp_var))

        # Empty series indexed 1..n: merging with it pads the lagged differences
        # back to one value per row (NA, then 0, for rows without a predecessor).
        # stats::lag is named explicitly so the zoo method is still dispatched
        # after dplyr (which masks lag) has been attached.
        b <- zoo(, seq_len(nrow(glb_allobs_df)))

        last1 <- as.numeric(merge(z - stats::lag(z, -1), b, all=TRUE))
        last1[is.na(last1)] <- 0
        glb_allobs_df[, paste0(feat, ".last1.log")] <- log(1 + last1)
        print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
                                                    paste0(feat, ".last1.log")] > 0, ],
                               ycol_names=paste0(feat, ".last1.log"),
                               xcol_name=glb_rsp_var))

        last10 <- as.numeric(merge(z - stats::lag(z, -10), b, all=TRUE))
        last10[is.na(last10)] <- 0
        glb_allobs_df[, paste0(feat, ".last10.log")] <- log(1 + last10)
        print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
                                                    paste0(feat, ".last10.log")] > 0, ],
                               ycol_names=paste0(feat, ".last10.log"),
                               xcol_name=glb_rsp_var))

        last100 <- as.numeric(merge(z - stats::lag(z, -100), b, all=TRUE))
        last100[is.na(last100)] <- 0
        glb_allobs_df[, paste0(feat, ".last100.log")] <- log(1 + last100)
        print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
                                                    paste0(feat, ".last100.log")] > 0, ],
                               ycol_names=paste0(feat, ".last100.log"),
                               xcol_name=glb_rsp_var))

        # Restore identifier order; the raw .zoo series is not a feature
        glb_allobs_df <- orderBy(reformulate(glb_id_var), glb_allobs_df)
        glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
                                                c(paste0(feat, ".zoo")))
        # all2$last3 = as.numeric(merge(z-lag(z, -3), b, all = TRUE))
        # all2$last5 = as.numeric(merge(z-lag(z, -5), b, all = TRUE))
        # all2$last10 = as.numeric(merge(z-lag(z, -10), b, all = TRUE))
        # all2$last20 = as.numeric(merge(z-lag(z, -20), b, all = TRUE))
        # all2$last50 = as.numeric(merge(z-lag(z, -50), b, all = TRUE))
        #
        #
        # # order table
        # all2 = all2[order(all2$id),]
        #
        # ## fill in NAs
        # # count averages
        # na.avg = all2 %>% group_by(weekend, hour) %>% dplyr::summarise(
        #     last1=mean(last1, na.rm=TRUE),
        #     last3=mean(last3, na.rm=TRUE),
        #     last5=mean(last5, na.rm=TRUE),
        #     last10=mean(last10, na.rm=TRUE),
        #     last20=mean(last20, na.rm=TRUE),
        #     last50=mean(last50, na.rm=TRUE)
        # )
        #
        # # fill in averages
        # na.merge = merge(all2, na.avg, by=c("weekend","hour"))
        # na.merge = na.merge[order(na.merge$id),]
        # for(i in c("last1", "last3", "last5", "last10", "last20", "last50")) {
        #     y = paste0(i, ".y")
        #     idx = is.na(all2[[i]])
        #     all2[idx,][[i]] <- na.merge[idx,][[y]]
        # }
        # rm(na.avg, na.merge, b, i, idx, n, pd, sec, sh, y, z)
    }
}
# check distribution of all numeric data
# Plot every feature in `feats_vctr` against the response:
#   regression     -> scatter plot of the response vs. the feature, smoothed
#   classification -> box plot of the feature per response class
# Factor features are additionally facetted by their own levels.
#   feats_vctr: character vector of column names in glb_allobs_df
dsp_numeric_feats_dstrb <- function(feats_vctr) {
    for (feat in feats_vctr) {
        print(paste0("feat: ", feat))

        if (glb_is_regression) {
            gp <- myplot_scatter(df=glb_allobs_df, ycol_name=glb_rsp_var,
                                 xcol_name=feat, smooth=TRUE)
        }
        if (glb_is_classification) {
            gp <- myplot_box(df=glb_allobs_df, ycol_names=feat,
                             xcol_name=glb_rsp_var)
        }

        is_fctr_feat <- inherits(glb_allobs_df[, feat], "factor")
        if (is_fctr_feat) {
            gp <- gp + facet_wrap(reformulate(feat))
        }

        print(gp)
    }
}
# dsp_numeric_vars_dstrb(setdiff(names(glb_allobs_df),
# union(myfind_chr_cols_df(glb_allobs_df),
# c(glb_rsp_var_raw, glb_rsp_var))))
# Add diagnostic features to an observations data frame.
# Currently the only feature added is .rnorm, a column of standard-normal
# random numbers with one value per row.
#   obs_df: data frame to extend
#   ref_df: not referenced in the body; defaults to glb_allobs_df
# Returns obs_df with the extra column(s).
#
# Template of other transformations that can be added to the mutate() call:
#         <col_name>.NA=is.na(<col_name>),
#         <col_name>.fctr=factor(<col_name>,
#                     as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
#         <col_name>.fctr=relevel(factor(<col_name>,
#                     as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
#                                   "<ref_val>"),
#         <col2_name>.fctr=relevel(factor(ifelse(<col1_name> == <val>, "<oth_val>", "<ref_val>")),
#                               as.factor(c("R", "<ref_val>")),
#                               ref="<ref_val>"),
#         This doesn't work - use sapply instead
#         <col_name>.fctr_num=grep(<col_name>, levels(<col_name>.fctr)),
#
#         Date.my=as.Date(strptime(Date, "%m/%d/%y %H:%M")),
#         Year=year(Date.my),
#         Month=months(Date.my),
#         Weekday=weekdays(Date.my)
#         <col_name>=<table>[as.character(<col2_name>)],
#         <col_name>=as.numeric(<col2_name>),
#         <col_name> = trunc(<col2_name> / 100),
add_new_diag_feats <- function(obs_df, ref_df=glb_allobs_df) {
    require(plyr)

    obs_df <- mutate(obs_df, .rnorm = rnorm(n=nrow(obs_df)))

    # If levels of a factor are different across obs_df & glb_newobs_df; predict.glm fails

    # Transformations not handled by mutate
    #     obs_df$<col_name>.fctr.num <- sapply(1:nrow(obs_df),
    #         function(row_ix) grep(obs_df[row_ix, "<col_name>"],
    #                               levels(obs_df[row_ix, "<col_name>.fctr"])))

    #print(summary(obs_df))
    #print(sapply(names(obs_df), function(col) sum(is.na(obs_df[, col]))))
    obs_df
}
glb_allobs_df <- add_new_diag_feats(glb_allobs_df)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#stop(here"); sav_allobs_df <- glb_allobs_df # glb_allobs_df <- sav_allobs_df
# Merge some <descriptor>
# glb_allobs_df$<descriptor>.my <- glb_allobs_df$<descriptor>
# glb_allobs_df[grepl("\\bAIRPORT\\b", glb_allobs_df$<descriptor>.my),
# "<descriptor>.my"] <- "AIRPORT"
# glb_allobs_df$<descriptor>.my <-
# plyr::revalue(glb_allobs_df$<descriptor>.my, c(
# "ABANDONED BUILDING" = "OTHER",
# "##" = "##"
# ))
# print(<descriptor>_freq_df <- mycreate_sqlxtab_df(glb_allobs_df, c("<descriptor>.my")))
# # print(dplyr::filter(<descriptor>_freq_df, grepl("(MEDICAL|DENTAL|OFFICE)", <descriptor>.my)))
# # print(dplyr::filter(dplyr::select(glb_allobs_df, -<var.zoo>),
# # grepl("STORE", <descriptor>.my)))
# glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features, "<descriptor>")
# Check distributions of newly transformed / extracted vars:
# every column of glb_allobs_df is plotted except character columns, the raw
# and mapped response, and the columns excluded as features
# Enhancement: remove vars that were displayed earlier
dsp_numeric_feats_dstrb(feats_vctr=setdiff(names(glb_allobs_df),
c(myfind_chr_cols_df(glb_allobs_df), glb_rsp_var_raw, glb_rsp_var,
glb_exclude_vars_as_features)))
## [1] "feat: .rnorm"
# Convert factors to dummy variables
# Build splines require(splines); bsBasis <- bs(training$age, df=3)
#pairs(subset(glb_trnobs_df, select=-c(col_symbol)))
# Check for glb_newobs_df & glb_trnobs_df features range mismatches
# Other diagnostics:
# print(subset(glb_trnobs_df, <col1_name> == max(glb_trnobs_df$<col1_name>, na.rm=TRUE) &
# <col2_name> <= mean(glb_trnobs_df$<col1_name>, na.rm=TRUE)))
# print(glb_trnobs_df[which.max(glb_trnobs_df$<col_name>),])
# print(<col_name>_freq_glb_trnobs_df <- mycreate_tbl_df(glb_trnobs_df, "<col_name>"))
# print(which.min(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>)[, 2]))
# print(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>))
# print(table(is.na(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(table(sign(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(mycreate_xtab_df(glb_trnobs_df, <col1_name>))
# print(mycreate_xtab_df(glb_trnobs_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mycreate_xtab_df(glb_trnobs_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_trnobs_df[is.na(<col1_name>_<col2_name>_xtab_glb_trnobs_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mutate(<col1_name>_<col2_name>_xtab_glb_trnobs_df,
# <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>)))
# print(mycreate_sqlxtab_df(glb_allobs_df, c("<col1_name>", "<col2_name>")))
# print(<col2_name>_min_entity_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>.NA, glb_trnobs_df$<col2_name>, mean, na.rm=TRUE)))
# Other plots:
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_trnobs_df, Symbol %in% c("CocaCola", "ProcterGamble")),
# "Date.POSIX", "StockPrice", facet_row_colnames="Symbol") +
# geom_vline(xintercept=as.numeric(as.POSIXlt("2003-03-01"))) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1983-01-01")))
# )
# print(myplot_line(subset(glb_trnobs_df, Date.POSIX > as.POSIXct("2004-01-01")),
# "Date.POSIX", "StockPrice") +
# geom_line(aes(color=Symbol)) +
# coord_cartesian(xlim=c(as.POSIXct("1990-01-01"),
# as.POSIXct("2000-01-01"))) +
# coord_cartesian(ylim=c(0, 250)) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1997-09-01"))) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1997-11-01")))
# )
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>") +
# geom_point(data=subset(glb_allobs_df, <condition>),
# mapping=aes(x=<x_var>, y=<y_var>), color="red", shape=4, size=5) +
# geom_vline(xintercept=84))
# Remove the lag-gap scratch vectors if the date-feature chunk created them.
# Filtering through ls() avoids "object not found" warnings when no date
# variables are configured and the vectors were therefore never created.
rm(list=intersect(c("last1", "last10", "last100"), ls()))
## Warning in rm(last1, last10, last100): object 'last1' not found
## Warning in rm(last1, last10, last100): object 'last10' not found
## Warning in rm(last1, last10, last100): object 'last100' not found
# Register the start of the "scrub.data" chunk; major.inc=FALSE records it as a
# minor step of the current major step (2.1 in the log printed below)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "scrub.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 2 inspect.data 2 0 13.768 19.736 5.968
## 3 scrub.data 2 1 19.737 NA NA
2.1: scrub data
# Options:
# 1. Not fill missing vars
# 2. Fill missing numerics with a different algorithm
# 3. Fill missing chars with data based on clusters
# Baseline report (missing values, 0s, Infs, NaNs, missing strings) before any scrubbing
mycheck_problem_data(glb_allobs_df)
## [1] "numeric data missing in glb_allobs_df: "
## dirty dirty.fctr
## 200 200
## [1] "numeric data w/ 0s in glb_allobs_df: "
## dirty
## 273
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## review .rownames
## 0 0
# if (!is.null(glb_force_0_to_NA_vars)) {
# for (feat in glb_force_0_to_NA_vars) {
# warning("Forcing ", sum(glb_allobs_df[, feat] == 0),
# " obs with ", feat, " 0s to NAs")
# glb_allobs_df[glb_allobs_df[, feat] == 0, feat] <- NA
# }
# }
# Same report again, to compare against the baseline; with the 0 -> NA forcing
# above commented out, the two reports are expected to be identical
mycheck_problem_data(glb_allobs_df)
## [1] "numeric data missing in glb_allobs_df: "
## dirty dirty.fctr
## 200 200
## [1] "numeric data w/ 0s in glb_allobs_df: "
## dirty
## 273
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## review .rownames
## 0 0
# Print a frequency table for each of the three category columns of
# glb_allobs_df (NewsDesk, SectionName, SubsectionName), each preceded by a
# label line.
dsp_catgs <- function() {
    obs_df <- glb_allobs_df

    print("NewsDesk:")
    print(table(obs_df$NewsDesk))

    print("SectionName:")
    print(table(obs_df$SectionName))

    print("SubsectionName:")
    print(table(obs_df$SubsectionName))
}
# sel_obs <- function(Popular=NULL,
# NewsDesk=NULL, SectionName=NULL, SubsectionName=NULL,
# Headline.contains=NULL, Snippet.contains=NULL, Abstract.contains=NULL,
# Headline.pfx=NULL, NewsDesk.nb=NULL, .clusterid=NULL, myCategory=NULL,
# perl=FALSE) {
# Select observations of glb_allobs_df that satisfy ALL the given filters.
#
# Args:
#   vars_lst: optional named list of filters (same names as below).
#   ...:      filters passed as named arguments, e.g.
#             sel_obs(Popular=1, Headline.contains="Boehner").
#   Supported filters:
#     Popular                      equality; NA selects rows with missing Popular
#     NewsDesk, SectionName,
#       SubsectionName             equality on the column of the same name
#     Headline.contains,
#       Snippet.contains,
#       Abstract.contains          regular expression matched against the
#                                  Headline / Snippet / Abstract column
#     Headline.pfx, NewsDesk.nb,
#       .clusterid, myCategory     equality; ignored with a warning when
#                                  glb_allobs_df has no such column
#     perl                         passed to grep() for the *.contains filters
#                                  (default FALSE)
#
# Returns:
#   Logical vector with one element per row of glb_allobs_df, TRUE for rows
#   whose UniqueID survives every filter.
sel_obs <- function(vars_lst=NULL, ...) {
    flt <- c(as.list(vars_lst), list(...))

    known_flts <- c("Popular", "NewsDesk", "SectionName", "SubsectionName",
                    "Headline.contains", "Snippet.contains", "Abstract.contains",
                    "Headline.pfx", "NewsDesk.nb", ".clusterid", "myCategory",
                    "perl")
    if (length(flt) > 0 &&
        (is.null(names(flt)) || !all(names(flt) %in% known_flts)))
        stop("sel_obs: filters must be named, one of: ",
             paste(known_flts, collapse=", "))

    perl <- if (is.null(flt[["perl"]])) FALSE else flt[["perl"]]

    tmp_df <- glb_allobs_df

    # which() drops rows where the comparison is NA instead of keeping them
    # as all-NA rows
    Popular <- flt[["Popular"]]
    if (!is.null(Popular)) {
        if (is.na(Popular))
            tmp_df <- tmp_df[is.na(tmp_df$Popular), ] else
            tmp_df <- tmp_df[which(tmp_df$Popular == Popular), ]
    }

    # Equality filters on columns that are expected to exist
    for (col in c("NewsDesk", "SectionName", "SubsectionName")) {
        val <- flt[[col]]
        if (!is.null(val))
            tmp_df <- tmp_df[which(tmp_df[[col]] == val), ]
    }

    # Regular-expression filters on the text columns
    for (col in c("Headline", "Snippet", "Abstract")) {
        pattern <- flt[[paste0(col, ".contains")]]
        if (!is.null(pattern))
            tmp_df <- tmp_df[grep(pattern, tmp_df[[col]], perl=perl), ]
    }

    # Equality filters on derived columns that may not have been created yet
    for (col in c("Headline.pfx", "NewsDesk.nb", ".clusterid", "myCategory")) {
        val <- flt[[col]]
        if (is.null(val)) next
        if (col %in% names(tmp_df))
            tmp_df <- tmp_df[which(tmp_df[[col]] == val), ] else
            warning("glb_allobs_df does not contain ", col,
                    "; ignoring that filter")
    }

    return(glb_allobs_df$UniqueID %in% tmp_df$UniqueID)
}
# Display the observations selected by sel_obs(...).
#   ...:  filters forwarded to sel_obs()
#   cols: extra columns to show besides UniqueID, Popular, myCategory, Headline
#   all:  TRUE prints every selected row with print(); FALSE uses myprint_df()
dsp_obs <- function(..., cols=c(NULL), all=FALSE) {
    show_cols <- union(c("UniqueID", "Popular", "myCategory", "Headline"), cols)
    tmp_df <- glb_allobs_df[sel_obs(...), show_cols, FALSE]
    if (all) print(tmp_df) else myprint_df(tmp_df)
}
#dsp_obs(Popular=1, NewsDesk="", SectionName="", Headline.contains="Boehner")
# dsp_obs(Popular=1, NewsDesk="", SectionName="")
# dsp_obs(Popular=NA, NewsDesk="", SectionName="")
# Print the NewsDesk x SectionName x SubsectionName x Popular contingency
# table for the observations picked by sel_obs(...); NA gets its own level
# only where it occurs.
dsp_tbl <- function(...) {
    picked_df <- glb_allobs_df[sel_obs(...), ]
    xtab <- table(picked_df$NewsDesk, picked_df$SectionName,
                  picked_df$SubsectionName, picked_df$Popular,
                  useNA="ifany")
    print(xtab)
}
# Cross-tabulate Headline.pfx, Headline and the response variable over the
# observations whose Headline matches the regular expression `str`.
dsp_hdlxtab <- function(str) {
    hit_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    xtab_cols <- c("Headline.pfx", "Headline", glb_rsp_var)
    print(mycreate_sqlxtab_df(hit_df, xtab_cols))
}
#dsp_hdlxtab("(1914)|(1939)")
# Cross-tabulate Headline.pfx, the three category columns and the response
# variable over the observations whose Headline matches the regular
# expression `str`.
dsp_catxtab <- function(str) {
    hit_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    xtab_cols <- c("Headline.pfx", "NewsDesk", "SectionName", "SubsectionName",
                   glb_rsp_var)
    print(mycreate_sqlxtab_df(hit_df, xtab_cols))
}
# dsp_catxtab("1914)|(1939)")
# dsp_catxtab("19(14|39|64):")
# dsp_catxtab("19..:")
# Create myCategory <- NewsDesk#SectionName#SubsectionName
# Fix some data before merging categories
# glb_allobs_df[sel_obs(Headline.contains="Your Turn:", NewsDesk=""),
# "NewsDesk"] <- "Styles"
# glb_allobs_df[sel_obs(Headline.contains="School", NewsDesk="", SectionName="U.S.",
# SubsectionName=""),
# "SubsectionName"] <- "Education"
# glb_allobs_df[sel_obs(Headline.contains="Today in Small Business:", NewsDesk="Business"),
# "SectionName"] <- "Business Day"
# glb_allobs_df[sel_obs(Headline.contains="Today in Small Business:", NewsDesk="Business"),
# "SubsectionName"] <- "Small Business"
# glb_allobs_df[sel_obs(Headline.contains="Readers Respond:"),
# "SectionName"] <- "Opinion"
# glb_allobs_df[sel_obs(Headline.contains="Readers Respond:"),
# "SubsectionName"] <- "Room For Debate"
# glb_allobs_df[sel_obs(NewsDesk="Business", SectionName="", SubsectionName="", Popular=NA),
# "SubsectionName"] <- "Small Business"
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(7973),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df[sel_obs(NewsDesk="Business", SectionName="", SubsectionName=""),
# "SectionName"] <- "Technology"
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(5076, 5736, 5924, 5911, 6532),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df[sel_obs(SectionName="Health"),
# "NewsDesk"] <- "Science"
# glb_allobs_df[sel_obs(SectionName="Travel"),
# "NewsDesk"] <- "Travel"
#
# glb_allobs_df[sel_obs(SubsectionName="Fashion & Style"),
# "SectionName"] <- ""
# glb_allobs_df[sel_obs(SubsectionName="Fashion & Style"),
# "SubsectionName"] <- ""
# glb_allobs_df[sel_obs(NewsDesk="Styles", SectionName="", SubsectionName="", Popular=1),
# "SectionName"] <- "U.S."
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% c(5486),
# c("UniqueID", "Headline", "myCategory", "NewsDesk", "SectionName", "SubsectionName")])
#
# glb_allobs_df$myCategory <- paste(glb_allobs_df$NewsDesk,
# glb_allobs_df$SectionName,
# glb_allobs_df$SubsectionName,
# sep="#")
# dsp_obs( Headline.contains="Music:"
# #,NewsDesk=""
# #,SectionName=""
# #,SubsectionName="Fashion & Style"
# #,Popular=1 #NA
# ,cols= c("UniqueID", "Headline", "Popular", "myCategory",
# "NewsDesk", "SectionName", "SubsectionName"),
# all=TRUE)
# dsp_obs( Headline.contains="."
# ,NewsDesk=""
# ,SectionName="Opinion"
# ,SubsectionName=""
# #,Popular=1 #NA
# ,cols= c("UniqueID", "Headline", "Popular", "myCategory",
# "NewsDesk", "SectionName", "SubsectionName"),
# all=TRUE)
# Merge some categories
# glb_allobs_df$myCategory <-
# plyr::revalue(glb_allobs_df$myCategory, c(
# "#Business Day#Dealbook" = "Business#Business Day#Dealbook",
# "#Business Day#Small Business" = "Business#Business Day#Small Business",
# "#Crosswords/Games#" = "Business#Crosswords/Games#",
# "Business##" = "Business#Technology#",
# "#Open#" = "Business#Technology#",
# "#Technology#" = "Business#Technology#",
#
# "#Arts#" = "Culture#Arts#",
# "Culture##" = "Culture#Arts#",
#
# "#World#Asia Pacific" = "Foreign#World#Asia Pacific",
# "Foreign##" = "Foreign#World#",
#
# "#N.Y. / Region#" = "Metro#N.Y. / Region#",
#
# "#Opinion#" = "OpEd#Opinion#",
# "OpEd##" = "OpEd#Opinion#",
#
# "#Health#" = "Science#Health#",
# "Science##" = "Science#Health#",
#
# "Styles##" = "Styles##Fashion",
# "Styles#Health#" = "Science#Health#",
# "Styles#Style#Fashion & Style" = "Styles##Fashion",
#
# "#Travel#" = "Travel#Travel#",
#
# "Magazine#Magazine#" = "myOther",
# "National##" = "myOther",
# "National#U.S.#Politics" = "myOther",
# "Sports##" = "myOther",
# "Sports#Sports#" = "myOther",
# "#U.S.#" = "myOther",
#
#
# # "Business##Small Business" = "Business#Business Day#Small Business",
# #
# # "#Opinion#" = "#Opinion#Room For Debate",
# "##" = "##"
# # "Business##" = "Business#Business Day#Dealbook",
# # "Foreign#World#" = "Foreign##",
# # "#Open#" = "Other",
# # "#Opinion#The Public Editor" = "OpEd#Opinion#",
# # "Styles#Health#" = "Styles##",
# # "Styles#Style#Fashion & Style" = "Styles##",
# # "#U.S.#" = "#U.S.#Education",
# ))
# ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")),
# mycreate_sqlxtab_df(glb_allobs_df,
# c("myCategory", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var)))
# myprint_df(ctgry_xtab_df)
# write.table(ctgry_xtab_df, paste0(glb_out_pfx, "ctgry_xtab.csv"),
# row.names=FALSE)
# ctgry_cast_df <- orderBy(~ -Y -NA, dcast(ctgry_xtab_df,
# myCategory + NewsDesk + SectionName + SubsectionName ~
# Popular.fctr, sum, value.var=".n"))
# myprint_df(ctgry_cast_df)
# write.table(ctgry_cast_df, paste0(glb_out_pfx, "ctgry_cast.csv"),
# row.names=FALSE)
# print(ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df[, glb_rsp_var],
# useNA="ifany"))
# Chi-squared test of association between "selected by sel_obs(...)" and
# Popular, restricted to observations whose Popular is not missing.
# Prints the 2-way table (rows: selected marker, cols: Popular) followed by
# the chisq.test() result.
dsp_chisq.test <- function(...) {
    has_rsp <- !is.na(glb_allobs_df$Popular)

    picked_df <- glb_allobs_df[sel_obs(...) & has_rsp, ]
    picked_df$.marker <- 1
    labelled_df <- glb_allobs_df[has_rsp, ]

    # Left join: labelled rows that were not selected get .marker = NA -> 0
    joined_df <- merge(labelled_df[, c(glb_id_var, "Popular")],
                       picked_df[, c(glb_id_var, ".marker")],
                       all.x=TRUE)
    joined_df[is.na(joined_df)] <- 0

    mrg_tbl <- table(joined_df$.marker, joined_df$Popular)
    print(mrg_tbl)
    print("Rows:Selected; Cols:Popular")
    print(chisq.test(mrg_tbl))
}
# dsp_chisq.test(Headline.contains="[Ee]bola")
# dsp_chisq.test(Snippet.contains="[Ee]bola")
# dsp_chisq.test(Abstract.contains="[Ee]bola")
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola"), ],
# c(glb_rsp_var, "NewsDesk", "SectionName", "SubsectionName")))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName))
# print(table(glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# glb_allobs_df$myCategory.fctr <- as.factor(glb_allobs_df$myCategory)
# glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
# c("myCategory", "NewsDesk", "SectionName", "SubsectionName"))
# Copy Headline into Snippet & Abstract if they are empty
# print(glb_allobs_df[nchar(glb_allobs_df[, "Snippet"]) == 0, c("Headline", "Snippet")])
# print(glb_allobs_df[glb_allobs_df$Headline == glb_allobs_df$Snippet,
# c("UniqueID", "Headline", "Snippet")])
# glb_allobs_df[nchar(glb_allobs_df[, "Snippet"]) == 0, "Snippet"] <-
# glb_allobs_df[nchar(glb_allobs_df[, "Snippet"]) == 0, "Headline"]
#
# print(glb_allobs_df[nchar(glb_allobs_df[, "Abstract"]) == 0, c("Headline", "Abstract")])
# print(glb_allobs_df[glb_allobs_df$Headline == glb_allobs_df$Abstract,
# c("UniqueID", "Headline", "Abstract")])
# glb_allobs_df[nchar(glb_allobs_df[, "Abstract"]) == 0, "Abstract"] <-
# glb_allobs_df[nchar(glb_allobs_df[, "Abstract"]) == 0, "Headline"]
# WordCount_0_df <- subset(glb_allobs_df, WordCount == 0)
# table(WordCount_0_df$Popular, WordCount_0_df$WordCount, useNA="ifany")
# myprint_df(WordCount_0_df[,
# c("UniqueID", "Popular", "WordCount", "Headline")])
2.1: scrub data
glb_chunks_df <- myadd_chunk(glb_chunks_df, "transform.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 3 scrub.data 2 1 19.737 20.465 0.728
## 4 transform.data 2 2 20.465 NA NA
### Mapping dictionary
#sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# For every feature listed in glb_map_vars, import its lookup table from
# glb_map_urls[[feat]] and apply it to glb_allobs_df via mymap_codes().
# The first column of the lookup table is passed as the join column and the
# second as the target column (presumably the new, mapped feature - confirm
# against mymap_codes in mydsutils.R).
if (!is.null(glb_map_vars)) {
for (feat in glb_map_vars) {
map_df <- myimport_data(url=glb_map_urls[[feat]],
comment="map_df",
print_diagn=TRUE)
glb_allobs_df <- mymap_codes(glb_allobs_df, feat, names(map_df)[2],
map_df, map_join_col_name=names(map_df)[1],
map_tgt_col_name=names(map_df)[2])
}
# The original (unmapped) features are no longer used as model features
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_map_vars)
}
### Forced Assignments
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Forced assignments: for every feature in glb_assign_vars create a copy
# named <feat>.my and overwrite, in that copy, each value listed in
# glb_assign_pairs_lst[[feat]]$from with the corresponding $to value.
# A `from` of NA targets the rows where the original feature is missing.
# Matching is always done against the original column, never the copy.
for (feat in glb_assign_vars) {
    new_feat <- paste0(feat, ".my")
    print(sprintf("Forced Assignments for: %s -> %s...", feat, new_feat))
    glb_allobs_df[, new_feat] <- glb_allobs_df[, feat]

    pairs <- glb_assign_pairs_lst[[feat]]
    # seq_along() so that an empty `from` vector runs zero iterations
    # (1:length(x) would iterate over 1 and 0)
    for (pair_ix in seq_along(pairs$from)) {
        if ((is.na(pairs$from[pair_ix])) && (is.na(pairs$to[pair_ix])))
            stop("what are you trying to do ???")

        # NA-safe row mask: `==` would yield NA for missing values, which
        # makes the count NA and is illegal as a data-frame row subscript
        if (is.na(pairs$from[pair_ix]))
            sel_rows <- is.na(glb_allobs_df[, feat]) else
            sel_rows <- glb_allobs_df[, feat] %in% pairs$from[pair_ix]

        nobs <- sum(sel_rows)
        glb_allobs_df[sel_rows, new_feat] <- pairs$to[pair_ix]

        print(sprintf(" %s -> %s for %s obs",
                      pairs$from[pair_ix], pairs$to[pair_ix],
                      format(nobs, big.mark=",")))
    }

    # The original features are replaced by their .my copies as predictors
    glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_assign_vars)
}
### Transformations using mapping functions
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# For every feature in glb_transform_vars, apply its mapping function
# (glb_transform_lst[[feat]]$mapfn) to the whole column and store the result
# in a new column named <feat><sfx>, where sfx comes from the same list entry.
for (feat in glb_transform_vars) {
new_feat <- paste0(feat, glb_transform_lst[[feat]]$sfx)
print(sprintf("Applying mapping function for: %s -> %s...", feat, new_feat))
glb_allobs_df[, new_feat] <- glb_transform_lst[[feat]]$mapfn(glb_allobs_df[, feat])
# The untransformed source features are excluded from the model features
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_transform_vars)
}
## [1] "Applying mapping function for: review -> review.my..."
### Derivations using mapping functions
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Derivations: create each feature named in glb_derive_vars by calling
# glb_derive_lst[[new_feat]]$mapfn with the columns of glb_allobs_df named in
# glb_derive_lst[[new_feat]]$args, passed as arguments named after the columns.
for (new_feat in glb_derive_vars) {
    print(sprintf("Creating new feature: %s...", new_feat))
    # Start from an empty list rather than NULL: do.call() requires a list,
    # so a derivation with no args would otherwise fail, and `[[<-` on NULL
    # does not always produce a list
    args_lst <- list()
    for (arg in glb_derive_lst[[new_feat]]$args)
        args_lst[[arg]] <- glb_allobs_df[, arg]
    glb_allobs_df[, new_feat] <- do.call(glb_derive_lst[[new_feat]]$mapfn, args_lst)
}
## [1] "Creating new feature: review.niso8859.log..."
2.2: transform data
glb_chunks_df <- myadd_chunk(glb_chunks_df, "manage.missing.data", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 4 transform.data 2 2 20.465 23.948 3.483
## 5 manage.missing.data 2 3 23.948 NA NA
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# glb_trnobs_df <- na.omit(glb_trnobs_df)
# glb_newobs_df <- na.omit(glb_newobs_df)
# df[is.na(df)] <- 0
# Diagnostic pass before imputation: per the output that follows, reports
# per-column counts of missing numerics, zeros, Infs, NaNs and missing strings.
mycheck_problem_data(glb_allobs_df)
## [1] "numeric data missing in glb_allobs_df: "
## dirty dirty.fctr
## 200 200
## [1] "numeric data w/ 0s in glb_allobs_df: "
## dirty
## 273
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## review .rownames review.my
## 0 0 0
# Not refactored into mydsutils.R since glb_*_df might be reassigned
# Impute missing values in the candidate feature columns of glb_allobs_df
# with mice.
#
# The columns imputed are all columns except glb_exclude_vars_as_features and
# the response variable glb_rsp_var. The RNG is seeded with
# glb_mice_complete.seed so that the imputation is repeatable.
#
# Returns:
#   A data frame holding only the columns that differ from the input after
#   imputation. It is always a data frame (possibly with zero columns), so
#   callers can safely use ncol() / names() on it.
glb_impute_missing_data <- function() {
    library(mice)
    set.seed(glb_mice_complete.seed)

    # drop=FALSE keeps a data frame even when a single column qualifies
    inp_impent_df <- glb_allobs_df[, setdiff(names(glb_allobs_df),
                                union(glb_exclude_vars_as_features, glb_rsp_var)),
                                   drop=FALSE]
    print("Summary before imputation: ")
    print(summary(inp_impent_df))

    out_impent_df <- complete(mice(inp_impent_df))
    print(summary(out_impent_df))

    # complete(mice()) changes attributes of factors even though values don't
    # change, so identical() may also flag such columns as changed
    changed <- vapply(names(out_impent_df),
                      function(col) !identical(out_impent_df[, col],
                                               inp_impent_df[, col]),
                      logical(1))
    ret_vars <- names(out_impent_df)[changed]

    # Without drop=FALSE a single changed column would collapse to a vector
    return(out_impent_df[, ret_vars, drop=FALSE])
}
# Run the imputation only when it is enabled, numeric columns actually have
# missing values, and the imputation returned at least one changed column.
# Note that nonna_df is assigned inside the condition (third operand).
if (glb_impute_na_data &&
(length(myfind_numerics_missing(glb_allobs_df)) > 0) &&
(ncol(nonna_df <- glb_impute_missing_data()) > 0)) {
for (col in names(nonna_df)) {
# Keep the original column; store the imputed values as <col>.nonNA and
# exclude the original from the model features
glb_allobs_df[, paste0(col, ".nonNA")] <- nonna_df[, col]
glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features, col)
}
}
# Same diagnostic after imputation.
# NOTE(review): terminate = TRUE presumably aborts the run when problems
# remain - confirm against mycheck_problem_data in mydsutils.R.
mycheck_problem_data(glb_allobs_df, terminate = TRUE)
## [1] "numeric data missing in glb_allobs_df: "
## dirty dirty.fctr
## 200 200
## [1] "numeric data w/ 0s in glb_allobs_df: "
## dirty
## 273
## [1] "numeric data w/ Infs in glb_allobs_df: "
## named integer(0)
## [1] "numeric data w/ NaNs in glb_allobs_df: "
## named integer(0)
## [1] "string data missing in glb_allobs_df: "
## review .rownames review.my
## 0 0 0
2.3: manage missing data
#```{r extract_features, cache=FALSE, eval=!is.null(glb_txt_vars)}
# Record the start of the "extract.features" step in the global chunk log
# (major.inc=TRUE: the output below shows step_major going from 2 to 3).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "extract.features", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 5 manage.missing.data 2 3 23.948 24.022 0.074
## 6 extract.features 3 0 24.022 NA NA
# Start a separate chunk log for the sub-steps of feature extraction
extract.features_chunk_df <- myadd_chunk(NULL, "extract.features_bgn")
## label step_major step_minor bgn end elapsed
## 1 extract.features_bgn 1 0 24.029 NA NA
# Options:
# Select Tf, log(1 + Tf), Tf-IDF or BM25Tf-IDf
# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_trnobs_df$<col_name>), -2, na.pad=TRUE)
# glb_trnobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newobs_df$<col_name>), -2, na.pad=TRUE)
# glb_newobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
#
# glb_newobs_df[1, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df) - 1,
# "<col_name>"]
# glb_newobs_df[2, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df),
# "<col_name>"]
# glb_allobs_df <- mutate(glb_allobs_df,
# A.P.http=ifelse(grepl("http",Added,fixed=TRUE), 1, 0)
# )
#
# glb_trnobs_df <- mutate(glb_trnobs_df,
# )
#
# glb_newobs_df <- mutate(glb_newobs_df,
# )
# Create factors of string variables
# Log the start of the "factorize.str.vars" sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "factorize.str.vars"), major.inc=TRUE)
## label step_major step_minor bgn end
## 1 extract.features_bgn 1 0 24.029 24.039
## 2 extract.features_factorize.str.vars 2 0 24.039 NA
## elapsed
## 1 0.01
## 2 NA
#stop(here"); sav_allobs_df <- glb_allobs_df; #glb_allobs_df <- sav_allobs_df
# Find the character columns of glb_allobs_df (printed below)
print(str_vars <- myfind_chr_cols_df(glb_allobs_df))
## review .rownames .src review.my
## "review" ".rownames" ".src" "review.my"
# Character columns that are neither excluded nor text variables are turned
# into factors named <var>.fctr; a warning reports the number of unique values.
if (length(str_vars <- setdiff(str_vars,
c(glb_exclude_vars_as_features, glb_txt_vars))) > 0) {
for (var in str_vars) {
warning("Creating factors of string variable: ", var,
": # of unique values: ", length(unique(glb_allobs_df[, var])))
# Levels are taken in order of first appearance across all observations
glb_allobs_df[, paste0(var, ".fctr")] <- factor(glb_allobs_df[, var],
as.factor(unique(glb_allobs_df[, var])))
# glb_trnobs_df[, paste0(var, ".fctr")] <- factor(glb_trnobs_df[, var],
# as.factor(unique(glb_allobs_df[, var])))
# glb_newobs_df[, paste0(var, ".fctr")] <- factor(glb_newobs_df[, var],
# as.factor(unique(glb_allobs_df[, var])))
}
# The raw string columns are replaced by their .fctr versions as features
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, str_vars)
}
if (!is.null(glb_txt_vars)) {
# Packages used by the text-processing code that follows:
# foreach (%dopar% loops), gsubfn (gsubfn), stringr (str_extract_all,
# str_sub), tm (Corpus, tm_map, DocumentTermMatrix)
require(foreach)
require(gsubfn)
require(stringr)
require(tm)
# Log the start of the "process.text" sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "process.text"), major.inc=TRUE)
# Tabulate the distinct substrings of txt_vctr that match a regular expression.
#
# txt_vctr is not a parameter: it is read from the calling environment and
# must be set before this helper is called.
#
# Args:
#   re_str:      regular expression to extract.
#   ignore.case: whether matching is case-insensitive.
#
# Returns:
#   The result of mycreate_sqlxtab_df() on a one-column data frame
#   ("pattern") holding every non-empty match.
chk_pattern_freq <- function(re_str, ignore.case=TRUE) {
    ptn <- regex(re_str, ignore_case=ignore.case)
    hits_mtrx <- str_extract_all(txt_vctr, ptn, simplify=TRUE)
    # simplify=TRUE pads shorter rows with ""; drop the padding
    hits_vctr <- hits_mtrx[hits_mtrx != ""]
    hits_df <- as.data.frame(hits_vctr)
    names(hits_df) <- "pattern"
    mycreate_sqlxtab_df(hits_df, "pattern")
}
#tmp_freq_df <- chk_pattern_freq("\\bNew (\\w)+", ignore.case=FALSE)
#subset(chk_pattern_freq("\\bNew (\\w)+", ignore.case=FALSE), grepl("New [[:upper:]]", pattern))
#chk_pattern_freq("\\bnew (\\W)+")
# Debug helper: show the effect of one gsubfn substitution on sample text.
#
# Looks up the pos_ix-th regular expression and replacement formula in
# gsubfn_args_lst, then prints up to five elements of txt_vctr that match,
# before and after the substitution. Both gsubfn_args_lst and txt_vctr are
# read from the calling environment.
#
# Args:
#   pos_ix: position of the (re_str, rp_frmla) pair in gsubfn_args_lst.
chk_subfn <- function(pos_ix) {
    re_str <- gsubfn_args_lst[["re_str"]][[pos_ix]]
    print("re_str:"); print(re_str)
    rp_frmla <- gsubfn_args_lst[["rp_frmla"]][[pos_ix]]
    print("rp_frmla:"); print(rp_frmla, showEnv=FALSE)

    # head() instead of [1:5]: with fewer than five matches, [1:5] pads the
    # sample with NA entries
    tmp_vctr <- head(grep(re_str, txt_vctr, value=TRUE, ignore.case=TRUE), 5)
    print("Before:")
    print(tmp_vctr)
    print("After:")
    print(gsubfn(re_str, rp_frmla, tmp_vctr, ignore.case=TRUE))
}
#chk_subfn(1)
# Apply every substitution in gsub_map_lst, in order, to txt_vctr.
#
# gsub_map_lst (names = patterns, values = replacements) and txt_vctr are
# read from the calling environment; the caller's txt_vctr is not modified.
# Progress is printed for every 10th pattern.
#
# Args:
#   ...: extra arguments forwarded to gsub() (e.g. ignore.case, fixed).
#
# Returns:
#   The substituted character vector; txt_vctr unchanged when the map is empty.
myapply_gsub <- function(...) {
    ptn_names <- names(gsub_map_lst)
    n_ptns <- length(ptn_names)
    if (n_ptns == 0)
        return(txt_vctr)

    res_vctr <- txt_vctr
    for (ix in seq_len(n_ptns)) {
        if ((ix %% 10) == 0)
            print(sprintf("running gsub for %02d (of %02d): #%s#...", ix,
                          n_ptns, ptn_names[ix]))
        res_vctr <- gsub(ptn_names[ix], gsub_map_lst[[ix]], res_vctr, ...)
    }
    res_vctr
}
# Apply every row of glb_txt_map_df, in order, as a gsub() substitution.
#
# glb_txt_map_df is read from the calling environment; its rex_str column
# holds the regular expressions and rpl_str the replacements. Progress is
# printed for every 10th pattern.
#
# Args:
#   txt_vctr: character vector to scrub.
#   ...:      extra arguments forwarded to gsub() (e.g. ignore.case).
#
# Returns:
#   The scrubbed character vector; txt_vctr unchanged when the map has no rows.
myapply_txtmap <- function(txt_vctr, ...) {
    nrows <- nrow(glb_txt_map_df)
    # seq_len() so that an empty map is a no-op; 1:nrows would iterate over
    # 1 and 0 and call gsub() with NA / zero-length patterns
    for (ptn_ix in seq_len(nrows)) {
        if ((ptn_ix %% 10) == 0)
            print(sprintf("running gsub for %02d (of %02d): #%s#...", ptn_ix,
                          nrows, glb_txt_map_df[ptn_ix, "rex_str"]))
        txt_vctr <- gsub(glb_txt_map_df[ptn_ix, "rex_str"],
                         glb_txt_map_df[ptn_ix, "rpl_str"],
                         txt_vctr, ...)
    }
    return(txt_vctr)
}
# Debug helper: print whether the saved and current "Headline" texts agree
# on elements bgn..end (the all.equal() result of the two slices).
chk.equal <- function(bgn, end) {
    rng <- bgn:end
    saved_slice <- sav_txt_lst[["Headline"]][rng]
    current_slice <- glb_txt_lst[["Headline"]][rng]
    print(all.equal(saved_slice, current_slice))
}
# Debug helper: print elements bgn..end of the saved "Headline" texts,
# then the same elements of the current ones, for visual comparison.
dsp.equal <- function(bgn, end) {
    rng <- bgn:end
    print(sav_txt_lst[["Headline"]][rng])
    print(glb_txt_lst[["Headline"]][rng])
}
#sav_txt_lst <- glb_txt_lst; all.equal(sav_txt_lst, glb_txt_lst)
#all.equal(sav_txt_lst[["Headline"]][1:4200], glb_txt_lst[["Headline"]][1:4200])
#all.equal(sav_txt_lst[["Headline"]][1:2000], glb_txt_lst[["Headline"]][1:2000])
#all.equal(sav_txt_lst[["Headline"]][1:1000], glb_txt_lst[["Headline"]][1:1000])
#all.equal(sav_txt_lst[["Headline"]][1:500], glb_txt_lst[["Headline"]][1:500])
#all.equal(sav_txt_lst[["Headline"]][1:200], glb_txt_lst[["Headline"]][1:200])
#all.equal(sav_txt_lst[["Headline"]][1:100], glb_txt_lst[["Headline"]][1:100])
#chk.equal( 1, 100)
#chk.equal(51, 100)
#chk.equal(81, 100)
#chk.equal(81, 90)
#chk.equal(81, 85)
#chk.equal(86, 90)
#chk.equal(96, 100)
#dsp.equal(86, 90)
# Load the substitution table used to scrub the text variables; lines
# starting with "#" in the file are comments. myapply_txtmap reads its
# rex_str (pattern) and rpl_str (replacement) columns.
glb_txt_map_df <- read.csv("mytxt_map.csv", comment.char="#", strip.white=TRUE)
glb_txt_lst <- list();
print(sprintf("Building glb_txt_lst..."))
# One scrubbed character vector per text variable. The value collected by
# foreach for each variable is the value of the final assignment in the body.
glb_txt_lst <- foreach(txt_var=glb_txt_vars) %dopar% {
# for (txt_var in glb_txt_vars) {
txt_vctr <- glb_allobs_df[, txt_var]
# myapply_txtmap shd be created as a tm_map::content_transformer ?
#print(glb_txt_map_df)
#txt_var=glb_txt_vars[3]; txt_vctr <- glb_txt_lst[[txt_var]]
#print(rex_str <- glb_txt_map_df[1, "rex_str"])
#print(rex_str <- glb_txt_map_df[glb_txt_map_df$rex_str == "\\bWall St\\.", "rex_str"])
#print(rex_str <- glb_txt_map_df[grepl("du Pont", glb_txt_map_df$rex_str), "rex_str"])
#print(rex_str <- glb_txt_map_df[glb_txt_map_df$rpl_str == "versus", "rex_str"])
#print(tmp_vctr <- grep(rex_str, txt_vctr, value=TRUE, ignore.case=FALSE))
#ret_lst <- regexec(rex_str, txt_vctr, ignore.case=FALSE); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
#gsub(rex_str, glb_txt_map_df[glb_txt_map_df$rex_str == rex_str, "rpl_str"], tmp_vctr, ignore.case=FALSE)
#grep("Hong Hong", txt_vctr, value=TRUE)
# Substitutions are applied case-sensitively
txt_vctr <- myapply_txtmap(txt_vctr, ignore.case=FALSE)
}
# foreach returns an unnamed list; index it by text variable name
names(glb_txt_lst) <- glb_txt_vars
# Debug helper: print where rex_str matches in element ix of txt_vctr (read
# from the calling environment), followed by a substring window for each
# match. The window bounds are derived arithmetically from the match
# positions exactly as written below.
dsp_matches <- function(rex_str, ix) {
    match_pos <- gregexpr(rex_str, txt_vctr[ix])
    print(match_pos)
    hit_pos <- match_pos[[1]]
    win_bgn <- (hit_pos / 100) * 99 + 0
    win_end <- (hit_pos / 100) * 100 + 100
    print(str_sub(txt_vctr[ix], win_bgn, win_end))
}
#stop(here")
# Report acronyms that survived the substitution map: for each text variable,
# list the observations containing a run of two or more "uppercase letter +
# period" groups (each optionally followed by spaces). Multiple matches in
# one observation are joined with "#".
for (txt_var in glb_txt_vars) {
print(sprintf("Remaining Acronyms in %s:", txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
match_lst <- gregexpr("([[:upper:]]\\.( *)){2,}", txt_vctr)
match_lst <- regmatches(txt_vctr, match_lst)
match_df <- data.frame(matches=sapply(match_lst,
function (elems) paste(elems, collapse="#")))
# Keep only the observations that had at least one match
match_df <- subset(match_df, matches != "")
print(match_df)
#dsp_matches(rex_str="([[:upper:]]\\.( *)){2,}", ix=107)
#dsp_matches(rex_str="\\bR\\.I\\.P(\\.*)(\\B)", ix=461)
#dsp_matches(rex_str="\\bR\\.I\\.P(\\.*)", ix=461)
#print(str_sub(txt_vctr[676], 10100, 10200))
#print(str_sub(txt_vctr[74], 1, -1))
}
# Report two-word terms starting with a common place-name prefix (Fort, New,
# San, St., ...) that are still present in each text variable, most frequent
# first. Only matches whose second word starts with an uppercase letter are
# kept.
for (txt_var in glb_txt_vars) {
re_str <- "\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
# chk_pattern_freq reads txt_vctr from the enclosing environment
txt_vctr <- glb_txt_lst[[txt_var]]
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl("( |-)[[:upper:]]", pattern))))
print(" consider cleaning if relevant to problem domain; geography name; .n > 1")
#grep("New G", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Wins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# Report terms that start with a single-letter compass abbreviation
# (N, S, E, W, C) followed by a space or period and a word, most frequent
# first.
for (txt_var in glb_txt_vars) {
re_str <- "\\b(N|S|E|W|C)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
# chk_pattern_freq reads txt_vctr from the enclosing environment
txt_vctr <- glb_txt_lst[[txt_var]]
# NOTE(review): "." is a regex wildcard here, so this subset keeps every
# non-empty pattern; if a literal period was intended it needs "\\." or
# fixed=TRUE - confirm intent
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("N Weaver", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# Report terms that start with a spelled-out compass word (North, South,
# East, West, Central) followed by a space or period and a word, most
# frequent first.
for (txt_var in glb_txt_vars) {
re_str <- "\\b(North|South|East|West|Central)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
# chk_pattern_freq reads txt_vctr from the enclosing environment
txt_vctr <- glb_txt_lst[[txt_var]]
# NOTE(review): "." is a regex wildcard here, so this subset keeps every
# non-empty pattern - confirm whether a literal period was intended
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("Central (African|Bankers|Cast|Italy|Role|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("East (Africa|Berlin|London|Poland|Rivals|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("North (American|Korean|West)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("South (Pacific|Street)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Martins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# Report the hyphenated (compound) terms found in a text vector.
#
# Builds a lower-cased, punctuation-stripped corpus (intra-word dashes are
# preserved), computes term frequencies, keeps the terms containing "-",
# drops those matching any rex_str pattern in mytxt_compound.csv, and prints
# what remains, most frequent first.
#
# Args:
#   txt_vctr: character vector with one element per row of glb_allobs_df
#             (the row names of glb_allobs_df are used to label the matrix).
#
# Returns:
#   The value of myprint_df() on the remaining compound terms.
find_cmpnd_wrds <- function(txt_vctr) {
    txt_corpus <- Corpus(VectorSource(txt_vctr))
    txt_corpus <- tm_map(txt_corpus, tolower)
    txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
    txt_corpus <- tm_map(txt_corpus, removePunctuation,
                         preserve_intra_word_dashes=TRUE)
    full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
                                      control=list(weighting=weightTf))
    print(" Full TermMatrix:"); print(full_Tf_DTM)

    full_Tf_mtrx <- as.matrix(full_Tf_DTM)
    rownames(full_Tf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
    full_Tf_vctr <- colSums(full_Tf_mtrx)
    names(full_Tf_vctr) <- dimnames(full_Tf_DTM)[[2]]
    #grep("year", names(full_Tf_vctr), value=TRUE)
    #which.max(full_Tf_mtrx[, "yearlong"])

    full_Tf_df <- as.data.frame(full_Tf_vctr)
    names(full_Tf_df) <- "Tf.full"
    full_Tf_df$term <- rownames(full_Tf_df)
    #full_Tf_df$freq.full <- colSums(full_Tf_mtrx != 0)
    full_Tf_df <- orderBy(~ -Tf.full, full_Tf_df)

    # Logical mask on the term column rather than indexing by row name
    cmpnd_Tf_df <- full_Tf_df[grepl("-", full_Tf_df$term), , drop=FALSE]

    filter_df <- read.csv("mytxt_compound.csv", comment.char="#", strip.white=TRUE)
    cmpnd_Tf_df$filter <- FALSE
    # seq_len() so that an empty filter file filters nothing; 1:nrow() would
    # iterate over 1 and 0 and feed NA / empty patterns to grepl()
    for (row_ix in seq_len(nrow(filter_df)))
        cmpnd_Tf_df[!cmpnd_Tf_df$filter, "filter"] <-
            grepl(filter_df[row_ix, "rex_str"],
                  cmpnd_Tf_df[!cmpnd_Tf_df$filter, "term"], ignore.case=TRUE)
    cmpnd_Tf_df <- subset(cmpnd_Tf_df, !filter)

    # Bug in tm_map(txt_corpus, removePunctuation, preserve_intra_word_dashes=TRUE) ???
    # "net-a-porter" gets converted to "net-aporter"
    #grep("net-a-porter", txt_vctr, ignore.case=TRUE, value=TRUE)
    #grep("maser-laser", txt_vctr, ignore.case=TRUE, value=TRUE)
    #txt_corpus[[which(grepl("net-a-porter", txt_vctr, ignore.case=TRUE))]]
    #grep("\\b(across|longer)-(\\w)", cmpnd_Tf_df$term, ignore.case=TRUE, value=TRUE)
    #grep("(\\w)-(affected|term)\\b", cmpnd_Tf_df$term, ignore.case=TRUE, value=TRUE)

    print(sprintf("nrow(cmpnd_Tf_df): %d", nrow(cmpnd_Tf_df)))
    myprint_df(cmpnd_Tf_df)
}
# Log the start of the compound-terms reporting sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "process.text_reporting_compound_terms"), major.inc=FALSE)
# The actual report (find_cmpnd_wrds) is commented out, so this loop only
# prints the header line and sets txt_vctr for each text variable.
for (txt_var in glb_txt_vars) {
print(sprintf("Remaining compound terms in %s: ", txt_var))
txt_vctr <- glb_txt_lst[[txt_var]]
# find_cmpnd_wrds(txt_vctr)
#grep("thirty-five", txt_vctr, ignore.case=TRUE, value=TRUE)
#rex_str <- glb_txt_map_df[grepl("hirty", glb_txt_map_df$rex_str), "rex_str"]
}
# Log the start of the "build.corpus" sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "build.corpus"), major.inc=TRUE)
glb_corpus_lst <- list()
print(sprintf("Building glb_corpus_lst..."))
# Build one tm corpus per text variable from the scrubbed text. Pipeline:
# lower-case -> PlainTextDocument -> remove punctuation -> remove stop words
# -> stem. The value collected by foreach is the stemmed corpus (value of the
# last assignment in the body).
glb_corpus_lst <- foreach(txt_var=glb_txt_vars) %dopar% {
# for (txt_var in glb_txt_vars) {
txt_corpus <- Corpus(VectorSource(glb_txt_lst[[txt_var]]))
txt_corpus <- tm_map(txt_corpus, tolower) #nuppr
txt_corpus <- tm_map(txt_corpus, PlainTextDocument)
txt_corpus <- tm_map(txt_corpus, removePunctuation) #npnct<chr_ix>
# txt-corpus <- tm_map(txt_corpus, content_transformer(function(x, pattern) gsub(pattern, "", x))
# Not to be run in production
# Diagnostic helper: prints and returns the term-frequency table of the
# corpus at its current pipeline stage. It is only referenced from the
# commented-out calls below.
inspect_terms <- function() {
full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
control=list(weighting=weightTf))
print(" Full TermMatrix:"); print(full_Tf_DTM)
full_Tf_mtrx <- as.matrix(full_Tf_DTM)
rownames(full_Tf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
full_Tf_vctr <- colSums(full_Tf_mtrx)
names(full_Tf_vctr) <- dimnames(full_Tf_DTM)[[2]]
#grep("year", names(full_Tf_vctr), value=TRUE)
#which.max(full_Tf_mtrx[, "yearlong"])
full_Tf_df <- as.data.frame(full_Tf_vctr)
names(full_Tf_df) <- "Tf.full"
full_Tf_df$term <- rownames(full_Tf_df)
#full_Tf_df$freq.full <- colSums(full_Tf_mtrx != 0)
full_Tf_df <- orderBy(~ -Tf.full +term, full_Tf_df)
print(myplot_histogram(full_Tf_df, "Tf.full"))
myprint_df(full_Tf_df)
#txt_corpus[[which(grepl("zun", txt_vctr, ignore.case=TRUE))]]
# Terms containing digits are listed separately
digit_terms_df <- subset(full_Tf_df, grepl("[[:digit:]]", term))
myprint_df(digit_terms_df)
return(full_Tf_df)
}
#print("RemovePunct:"); remove_punct_Tf_df <- inspect_terms()
# Stop words: the per-variable additions plus tm's English list
txt_corpus <- tm_map(txt_corpus, removeWords,
c(glb_append_stop_words[[txt_var]],
stopwords("english"))) #nstopwrds
#print("StoppedWords:"); stopped_words_Tf_df <- inspect_terms()
txt_corpus <- tm_map(txt_corpus, stemDocument) #Features for lost information: Difference/ratio in density of full_TfIdf_DTM ???
#txt_corpus <- tm_map(txt_corpus, content_transformer(stemDocument))
#print("StemmedWords:"); stemmed_words_Tf_df <- inspect_terms()
#stemmed_stopped_Tf_df <- merge(stemmed_words_Tf_df, stopped_words_Tf_df, by="term", all=TRUE, suffixes=c(".stem", ".stop"))
#myprint_df(stemmed_stopped_Tf_df)
#print(subset(stemmed_stopped_Tf_df, grepl("compan", term)))
#glb_corpus_lst[[txt_var]] <- txt_corpus
}
# foreach returns an unnamed list; index it by text variable name
names(glb_corpus_lst) <- glb_txt_vars
# Log the start of the "extract.DTM" sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "extract.DTM"), major.inc=TRUE)
glb_full_DTM_lst <- list(); glb_sprs_DTM_lst <- list();
# For each text variable build the full TfIdf-weighted document-term matrix
# and a reduced one with sparse terms removed at the per-variable threshold
# glb_sprs_thresholds[txt_var]; both are stored by variable name.
for (txt_var in glb_txt_vars) {
print(sprintf("Extracting TfIDf terms for %s...", txt_var))
txt_corpus <- glb_corpus_lst[[txt_var]]
# full_Tf_DTM <- DocumentTermMatrix(txt_corpus,
# control=list(weighting=weightTf))
full_TfIdf_DTM <- DocumentTermMatrix(txt_corpus,
control=list(weighting=weightTfIdf))
sprs_TfIdf_DTM <- removeSparseTerms(full_TfIdf_DTM,
glb_sprs_thresholds[txt_var])
# glb_full_DTM_lst[[txt_var]] <- full_Tf_DTM
# glb_sprs_DTM_lst[[txt_var]] <- sprs_Tf_DTM
glb_full_DTM_lst[[txt_var]] <- full_TfIdf_DTM
glb_sprs_DTM_lst[[txt_var]] <- sprs_TfIdf_DTM
}
# Log the start of the "report.DTM" sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "report.DTM"), major.inc=TRUE)
# For each text variable: summarise the full and sparse TfIdf matrices per
# term, merge the two summaries, add high-weight terms missing from the
# sparse matrix to glb_important_terms, and plot the results.
for (txt_var in glb_txt_vars) {
print(sprintf("Reporting TfIDf terms for %s...", txt_var))
full_TfIdf_DTM <- glb_full_DTM_lst[[txt_var]]
sprs_TfIdf_DTM <- glb_sprs_DTM_lst[[txt_var]]
print(" Full TermMatrix:"); print(full_TfIdf_DTM)
full_TfIdf_mtrx <- as.matrix(full_TfIdf_DTM)
rownames(full_TfIdf_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
# Per-term totals over all documents
full_TfIdf_vctr <- colSums(full_TfIdf_mtrx)
names(full_TfIdf_vctr) <- dimnames(full_TfIdf_DTM)[[2]]
#grep("scene", names(full_TfIdf_vctr), value=TRUE)
#which.max(full_TfIdf_mtrx[, "yearlong"])
full_TfIdf_df <- as.data.frame(full_TfIdf_vctr)
names(full_TfIdf_df) <- "TfIdf.full"
full_TfIdf_df$term <- rownames(full_TfIdf_df)
# Number of documents in which the term has a non-zero weight
full_TfIdf_df$freq.full <- colSums(full_TfIdf_mtrx != 0)
full_TfIdf_df <- orderBy(~ -TfIdf.full, full_TfIdf_df)
print(" Sparse TermMatrix:"); print(sprs_TfIdf_DTM)
# Same summary for the sparse (reduced) matrix
sprs_TfIdf_vctr <- colSums(as.matrix(sprs_TfIdf_DTM))
names(sprs_TfIdf_vctr) <- dimnames(sprs_TfIdf_DTM)[[2]]
sprs_TfIdf_df <- as.data.frame(sprs_TfIdf_vctr)
names(sprs_TfIdf_df) <- "TfIdf.sprs"
sprs_TfIdf_df$term <- rownames(sprs_TfIdf_df)
sprs_TfIdf_df$freq.sprs <- colSums(as.matrix(sprs_TfIdf_DTM) != 0)
sprs_TfIdf_df <- orderBy(~ -TfIdf.sprs, sprs_TfIdf_df)
# Left join on the shared "term" column: terms absent from the sparse matrix
# get NA in the *.sprs columns
terms_TfIdf_df <- merge(full_TfIdf_df, sprs_TfIdf_df, all.x=TRUE)
terms_TfIdf_df$in.sprs <- !is.na(terms_TfIdf_df$freq.sprs)
# Plot only terms whose full TfIdf is at least the smallest sparse-term TfIdf
plt_TfIdf_df <- subset(terms_TfIdf_df,
TfIdf.full >= min(terms_TfIdf_df$TfIdf.sprs, na.rm=TRUE))
# Label the plotted terms that were dropped from the sparse matrix ...
plt_TfIdf_df$label <- ""
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "label"] <-
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "term"]
# ... and remember them as important terms for this text variable
glb_important_terms[[txt_var]] <- union(glb_important_terms[[txt_var]],
plt_TfIdf_df[is.na(plt_TfIdf_df$TfIdf.sprs), "term"])
print(myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
colorcol_name="in.sprs") +
geom_text(aes(label=label), color="Black", size=3.5))
# Empirical CDF of every summary measure, with the sparsity threshold marked
melt_TfIdf_df <- orderBy(~ -value, melt(terms_TfIdf_df, id.var="term"))
print(ggplot(melt_TfIdf_df, aes(value, color=variable)) + stat_ecdf() +
geom_hline(yintercept=glb_sprs_thresholds[txt_var],
linetype = "dotted"))
# Bar chart of the terms retained in the sparse matrix
melt_TfIdf_df <- orderBy(~ -value,
melt(subset(terms_TfIdf_df, !is.na(TfIdf.sprs)), id.var="term"))
print(myplot_hbar(melt_TfIdf_df, "term", "value",
colorcol_name="variable"))
# Bar chart of the first 10 melted rows for terms NOT in the sparse matrix
melt_TfIdf_df <- orderBy(~ -value,
melt(subset(terms_TfIdf_df, is.na(TfIdf.sprs)), id.var="term"))
print(myplot_hbar(head(melt_TfIdf_df, 10), "term", "value",
colorcol_name="variable"))
}
# sav_full_DTM_lst <- glb_full_DTM_lst
# sav_sprs_DTM_lst <- glb_sprs_DTM_lst
# print(identical(sav_glb_corpus_lst, glb_corpus_lst))
# print(all.equal(length(sav_glb_corpus_lst), length(glb_corpus_lst)))
# print(all.equal(names(sav_glb_corpus_lst), names(glb_corpus_lst)))
# print(all.equal(sav_glb_corpus_lst[["Headline"]], glb_corpus_lst[["Headline"]]))
# print(identical(sav_full_DTM_lst, glb_full_DTM_lst))
# print(identical(sav_sprs_DTM_lst, glb_sprs_DTM_lst))
# Drop the large per-variable work objects left over from the reporting loop
rm(full_TfIdf_mtrx, full_TfIdf_df, melt_TfIdf_df, terms_TfIdf_df)
# Create txt features
# Feature names are prefixed with the upper-cased first letter of the text
# variable, so two text variables sharing a first letter would collide.
if ((length(glb_txt_vars) > 1) &&
(length(unique(pfxs <- sapply(glb_txt_vars,
function(txt) toupper(substr(txt, 1, 1))))) < length(glb_txt_vars)))
stop("Prefixes for corpus freq terms not unique: ", pfxs)
# Log the start of the "bind.DTM" sub-step
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "bind.DTM"),
major.inc=TRUE)
# Append the sparse TfIdf matrix of each text variable to glb_allobs_df as
# columns named <PFX>.T.<term>, where PFX is the upper-cased first letter of
# the variable name and the term is passed through make.names().
for (txt_var in glb_txt_vars) {
print(sprintf("Binding DTM for %s...", txt_var))
txt_var_pfx <- toupper(substr(txt_var, 1, 1))
txt_X_df <- as.data.frame(as.matrix(glb_sprs_DTM_lst[[txt_var]]))
colnames(txt_X_df) <- paste(txt_var_pfx, ".T.",
make.names(colnames(txt_X_df)), sep="")
rownames(txt_X_df) <- rownames(glb_allobs_df) # warning otherwise
# plt_X_df <- cbind(txt_X_df, glb_allobs_df[, c(glb_id_var, glb_rsp_var)])
# print(myplot_box(df=plt_X_df, ycol_names="H.T.today", xcol_name=glb_rsp_var))
# log_X_df <- log(1 + txt_X_df)
# colnames(log_X_df) <- paste(colnames(txt_X_df), ".log", sep="")
# plt_X_df <- cbind(log_X_df, glb_allobs_df[, c(glb_id_var, glb_rsp_var)])
# print(myplot_box(df=plt_X_df, ycol_names="H.T.today.log", xcol_name=glb_rsp_var))
glb_allobs_df <- cbind(glb_allobs_df, txt_X_df) # TfIdf is normalized
#glb_allobs_df <- cbind(glb_allobs_df, log_X_df) # if using non-normalized metrics
}
#identical(chk_entity_df, glb_allobs_df)
#chk_entity_df <- glb_allobs_df
# Register the "bind.DXM" step in the chunk-tracking data frame; per the
# knitted output this closes the timing of the previous step and opens a new
# row with a major step increment.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "bind.DXM"),
major.inc=TRUE)
# Debug aid: snapshot of glb_allobs_df before the DXM columns are added
#sav_allobs_df <- glb_allobs_df
# Regular expressions for the punctuation characters that are counted per
# document. Element i feeds the "<PFX>.npnct<ii>.log" feature, so the ORDER
# of this vector defines the feature numbering -- do not reorder.
# Regex metacharacters are escaped so each entry matches the literal
# character; "(" / ")" and "<" / ">" are deliberately counted together.
glb_punct_vctr <- c("!", "\"", "#", "\\$", "%", "&", "'",
                    "\\(|\\)",  # "\\(", "\\)",
                    "\\*", "\\+", ",", "-", "\\.", "/", ":", ";",
                    "<|>",      # "<",
                    "=",
                                # ">",
                    "\\?", "@", "\\[", "\\\\", "\\]",
                    "\\^",      # escaped: a bare "^" is the start-of-string
                                # anchor, not a literal caret
                    "_", "`",
                    "\\{", "\\|", "\\}", "~")
# Build the derived ("DXM") text features for every text variable and bind
# them to glb_allobs_df. One foreach iteration handles one text variable and
# returns a data frame holding only the newly created "<PFX>.*" columns;
# the per-variable frames are combined column-wise.
txt_X_df <- foreach(txt_var=glb_txt_vars, .combine=cbind) %dopar% {
#for (txt_var in glb_txt_vars) {
    print(sprintf("Binding DXM for %s...", txt_var))
    txt_var_pfx <- toupper(substr(txt_var, 1, 1))

    # Start every iteration from a fresh frame. The id / .rnorm columns are
    # only scaffolding and are removed at the end of the iteration, so a
    # frame shared across iterations would be missing them (or carry the
    # previous variable's columns) whenever foreach evaluates the body
    # sequentially in the calling environment.
    txt_X_df <- glb_allobs_df[, c(glb_id_var, ".rnorm"), FALSE]

    # Local helpers:
    #   feat_nm(sfx)       -> column name "<PFX><sfx>"
    #   count_ptn(ptn,...) -> integer count of ptn in the raw text column;
    #                         extra args (e.g. perl=TRUE) are forwarded
    feat_nm <- function(sfx) paste0(txt_var_pfx, sfx)
    count_ptn <- function(ptn, ...)
        as.integer(0 + mycount_pattern_occ(ptn, glb_allobs_df[, txt_var], ...))

    txt_full_DTM_mtrx <- as.matrix(glb_full_DTM_lst[[txt_var]])
    rownames(txt_full_DTM_mtrx) <- rownames(glb_allobs_df) # print unreadable otherwise
    #print(txt_full_DTM_mtrx[txt_full_DTM_mtrx[, "ebola"] != 0, "ebola"])

    # Create <txt_var>.T.<term> for glb_important_terms
    for (term in glb_important_terms[[txt_var]])
        txt_X_df[, feat_nm(paste0(".T.", make.names(term)))] <-
            txt_full_DTM_mtrx[, term]

    # Create <txt_var>.nwrds.log & .nwrds.unq.log
    txt_X_df[, feat_nm(".nwrds.log")] <-
        log(1 + mycount_pattern_occ("\\w+", glb_txt_lst[[txt_var]]))
    txt_X_df[, feat_nm(".nwrds.unq.log")] <-
        log(1 + rowSums(txt_full_DTM_mtrx != 0))
    txt_X_df[, feat_nm(".sum.TfIdf")] <- rowSums(txt_full_DTM_mtrx)
    # ratio = sum.TfIdf / nwrds, where nwrds is recovered from its log(1 + n)
    txt_X_df[, feat_nm(".ratio.sum.TfIdf.nwrds")] <-
        txt_X_df[, feat_nm(".sum.TfIdf")] /
        (exp(txt_X_df[, feat_nm(".nwrds.log")]) - 1)
    # 0 / 0 (no words at all) yields NaN; treat it as a zero ratio
    txt_X_df[is.nan(txt_X_df[, feat_nm(".ratio.sum.TfIdf.nwrds")]),
             feat_nm(".ratio.sum.TfIdf.nwrds")] <- 0

    # Create <txt_var>.nchrs.log, .nuppr.log, .ndgts.log from the raw text
    txt_X_df[, feat_nm(".nchrs.log")] <-
        log(1 + mycount_pattern_occ(".", glb_allobs_df[, txt_var]))
    txt_X_df[, feat_nm(".nuppr.log")] <-
        log(1 + mycount_pattern_occ("[[:upper:]]", glb_allobs_df[, txt_var]))
    txt_X_df[, feat_nm(".ndgts.log")] <-
        log(1 + mycount_pattern_occ("[[:digit:]]", glb_allobs_df[, txt_var]))

    # Create <txt_var>.npnct?.log
    # would this be faster if it's iterated over each row instead of
    #   each created column ???
    # To sanity-check a pattern:
    #     smp1 <- "! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~"
    #     mycount_pattern_occ(glb_punct_vctr[punct_ix], c(" ", smp1, paste(smp1, smp1)))
    for (punct_ix in seq_along(glb_punct_vctr)) {
        txt_X_df[, feat_nm(paste0(".npnct", sprintf("%02d", punct_ix), ".log"))] <-
            log(1 + mycount_pattern_occ(glb_punct_vctr[punct_ix],
                                        glb_allobs_df[, txt_var]))
    }
#     print(head(glb_allobs_df[glb_allobs_df[, "A.npnct23.log"] > 0,
#                                     c("UniqueID", "Popular", "Abstract", "A.npnct23.log")]))

    # Create <txt_var>.nstopwrds.log & <txt_var>ratio.nstopwrds.nwrds
    stop_words_rex_str <- paste0("\\b(",
                                 paste0(c(glb_append_stop_words[[txt_var]],
                                          stopwords("english")), collapse="|"),
                                 ")\\b")
    txt_X_df[, feat_nm(".nstopwrds.log")] <-
        log(1 + mycount_pattern_occ(stop_words_rex_str, glb_txt_lst[[txt_var]]))
    # difference of the two log(1 + n) features, exponentiated
    txt_X_df[, feat_nm(".ratio.nstopwrds.nwrds")] <-
        exp(txt_X_df[, feat_nm(".nstopwrds.log")] -
            txt_X_df[, feat_nm(".nwrds.log")])

    # Create <txt_var>.P.http
    txt_X_df[, feat_nm(".P.http")] <- count_ptn("http")

    # Create user-specified pattern vectors
    #   <txt_var>.P.year.colon
    txt_X_df[, feat_nm(".P.year.colon")]        <- count_ptn("[0-9]{4}:")
    txt_X_df[, feat_nm(".P.daily.clip.report")] <- count_ptn("Daily Clip Report")
    txt_X_df[, feat_nm(".P.fashion.week")]      <- count_ptn("Fashion Week")
    txt_X_df[, feat_nm(".P.first.draft")]       <- count_ptn("First Draft")

    #sum(mycount_pattern_occ("Metropolitan Diary:", glb_allobs_df$Abstract) > 0)
    if (txt_var %in% c("Snippet", "Abstract")) {
        txt_X_df[, feat_nm(".P.metropolitan.diary.colon")] <-
            count_ptn("Metropolitan Diary:")
    }

    #sum(mycount_pattern_occ("[0-9]{4}:", glb_allobs_df$Headline) > 0)
    #sum(mycount_pattern_occ("Quandary(.*)(?=:)", glb_allobs_df$Headline, perl=TRUE) > 0)
    #sum(mycount_pattern_occ("No Comment(.*):", glb_allobs_df$Headline) > 0)
    #sum(mycount_pattern_occ("Friday Night Music:", glb_allobs_df$Headline) > 0)
    if (txt_var %in% c("Headline")) {
        txt_X_df[, feat_nm(".P.facts.figures")]       <- count_ptn("Facts & Figures:")
        txt_X_df[, feat_nm(".P.friday.night.music")]  <- count_ptn("Friday Night Music")
        txt_X_df[, feat_nm(".P.no.comment.colon")]    <- count_ptn("No Comment(.*):")
        txt_X_df[, feat_nm(".P.on.this.day")]         <- count_ptn("On This Day")
        # look-ahead requires the perl regex engine
        txt_X_df[, feat_nm(".P.quandary")]            <- count_ptn("Quandary(.*)(?=:)", perl=TRUE)
        txt_X_df[, feat_nm(".P.readers.respond")]     <- count_ptn("Readers Respond")
        txt_X_df[, feat_nm(".P.recap.colon")]         <- count_ptn("Recap:")
        txt_X_df[, feat_nm(".P.s.notebook")]          <- count_ptn("s Notebook")
        txt_X_df[, feat_nm(".P.today.in.politic")]    <- count_ptn("Today in Politic")
        txt_X_df[, feat_nm(".P.today.in.smallbusiness")] <-
            count_ptn("Today in Small Business:")
        txt_X_df[, feat_nm(".P.verbatim.colon")]      <- count_ptn("Verbatim:")
        txt_X_df[, feat_nm(".P.what.we.are")]         <- count_ptn("What We're")
    }
    #summary(glb_allobs_df[ ,grep("P.on.this.day", names(glb_allobs_df), value=TRUE)])

    # Return only the new feature columns: drop the scaffolding columns by
    # exact name (a substring match could also remove feature columns, and a
    # negative index built from an empty match would drop everything).
    txt_X_df[, setdiff(names(txt_X_df), c(glb_id_var, ".rnorm")), FALSE]
}
glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
#myplot_box(glb_allobs_df, "A.sum.TfIdf", glb_rsp_var)
# Generate summaries
# print(summary(glb_allobs_df))
# print(sapply(names(glb_allobs_df), function(col) sum(is.na(glb_allobs_df[, col]))))
# print(summary(glb_trnobs_df))
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(summary(glb_newobs_df))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# The raw text columns themselves are not model features; only the columns
# derived from them are.
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
                                      glb_txt_vars)
# Remove the temporaries that actually exist. log_X_df is only created when
# the (currently commented-out) log transform is enabled, so an unconditional
# rm() raises "object 'log_X_df' not found".
rm(list=intersect(c("log_X_df", "txt_X_df"), ls()))
}
## Loading required package: stringr
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
## label step_major step_minor bgn end
## 2 extract.features_factorize.str.vars 2 0 24.039 24.346
## 3 extract.features_process.text 3 0 24.347 NA
## elapsed
## 2 0.307
## 3 NA
## [1] "Building glb_txt_lst..."
## [1] "running gsub for 10 (of 163): #\\bA\\.D\\.D\\.#..."
## [1] "running gsub for 20 (of 163): #\\bB\\.C\\.(\\B)#..."
## [1] "running gsub for 30 (of 163): #\\bC\\.O\\.P\\.D\\.#..."
## [1] "running gsub for 40 (of 163): #\\bETF\\.#..."
## [1] "running gsub for 50 (of 163): #\\bG\\.I\\.#..."
## [1] "running gsub for 60 (of 163): #\\bI\\.S\\.S\\.#..."
## [1] "running gsub for 70 (of 163): #\\bJ\\. Mendel#..."
## [1] "running gsub for 80 (of 163): #\\bN\\.F\\.L\\.#..."
## [1] "running gsub for 90 (of 163): #\\bNow O\\.K\\.#..."
## [1] "running gsub for 100 (of 163): #\\bS\\.B\\.A\\.#..."
## [1] "running gsub for 110 (of 163): #\\bU\\.A\\.R\\.#..."
## [1] "running gsub for 120 (of 163): #\\bW\\.C\\.(\\B)#..."
## [1] "running gsub for 130 (of 163): #\\bHong( |-)((Kong|Kongs|Kongers)\\b)+#..."
## [1] "running gsub for 140 (of 163): #\\b(Russia|Dummy)-(\\w)#..."
## [1] "running gsub for 150 (of 163): #(.+)-(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)#..."
## [1] "running gsub for 160 (of 163): #\\b[Tt]hirty-[Ff]ive(\\B)#..."
## [1] "Remaining Acronyms in review.my:"
## matches
## 375 B.O.
## 505 O.G.
## 706 K.S.
## [1] "Remaining #\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+# terms in review.my: "
## pattern .n
## 1 New HongKong 15
## 2 San Diego 14
## 3 New Saigon 6
## 4 San Jose 5
## 5 St. Germain 5
## 6 Puerto Vallarta 4
## 7 St. Patrick 4
## 8 Fort St 3
## 9 New Kowloon 3
## 10 St. Patty 3
## 12 New Belgium 2
## 13 New Chefs 2
## 14 New China 2
## 15 New Englander 2
## 17 San Antonio 2
## 18 San Gabriel 2
## 19 San Gimignano 2
## 20 San Marzano 2
## 21 Hong Restaurant 1
## 29 Las Margaritas 1
## 31 Los Angelino 1
## 32 Los Lobos 1
## 33 Los Muertos 1
## 34 Los Pancho 1
## 35 New Girl 1
## 36 New Guinea 1
## 37 New Jerseyite 1
## 38 New Luck 1
## 39 New Neighborhood 1
## 40 New Port 1
## 41 New Seattle 1
## 42 New Wave 1
## 48 San Fran 1
## 49 San Fransisco 1
## 50 San Ganbriel 1
## 51 San Gregorio 1
## 52 San Juans 1
## 53 San Pedro 1
## 55 St. Augustine 1
## 56 St. Bernard 1
## 57 St. Clouds 1
## 58 St. Coffee 1
## 59 St. Gyros 1
## 60 St. It 1
## 61 St. James 1
## 62 St. Michelle 1
## 63 St. Paddy 1
## 64 St. Pat 1
## [1] " consider cleaning if relevant to problem domain; geography name; .n > 1"
## [1] "Remaining #\\b(N|S|E|W|C)( |\\.)(\\w)+# terms in review.my: "
## pattern .n
## 1 N Spice 5
## 2 N Cheese 4
## 3 W Hotel 4
## 4 C and 3
## 5 W hotel 3
## 6 E Olive 2
## 7 N Jerry 2
## 8 S SAKE 2
## 9 W student 2
## 11 C ask 1
## 12 C does 1
## 13 C in 1
## 14 C is 1
## 15 C kicking 1
## 16 C once 1
## 17 C pumping 1
## 10 C Sushi 1
## 18 C with 1
## 19 C y 1
## 20 E Bella 1
## 21 E Cocoa 1
## 23 E its 1
## 22 E L 1
## 24 E sushi 1
## 25 N 10 1
## 26 N 45th 1
## 33 N and 1
## 27 N CHEESE 1
## 34 N delicious 1
## 35 N in 1
## 28 N Jacks 1
## 29 N Not 1
## 30 N Out 1
## 31 N Salsa 1
## 32 N Sour 1
## 36 S A 1
## 37 S ALL 1
## 49 S and 1
## 38 S Crispy 1
## 39 S EGGNOG 1
## 50 S for 1
## 51 S got 1
## 52 S has 1
## 40 S IN 1
## 53 S just 1
## 41 S LACED 1
## 42 S NOM 1
## 43 S NOT 1
## 44 S O 1
## 45 S OVER 1
## 46 S PLACE 1
## 47 S SO 1
## 54 S to 1
## 55 S wanted 1
## 48 S WHAT 1
## 60 W and 1
## 56 W Bar 1
## 61 W for 1
## 62 W glass 1
## 57 W N 1
## 58 W Seattle 1
## 59 W Sexy 1
## 63 W style 1
## 64 W that 1
## 65 W tip 1
## 66 W was 1
## [1] "Remaining #\\b(North|South|East|West|Central)( |\\.)(\\w)+# terms in review.my: "
## pattern .n
## 1 West Seattle 49
## 2 South Lake 13
## 3 West Coast 12
## 4 East Coast 8
## 5 North Seattle 7
## 6 Central District 6
## 7 North African 6
## 8 South Seattle 4
## 9 West 5 3
## 10 Central America 2
## 11 East African 2
## 13 East in 2
## 12 East Indian 2
## 14 North America 2
## 15 North to 2
## 16 South America 2
## 17 South China 2
## 18 South Jackson 2
## 19 West Carribean 2
## 20 Central American 1
## 25 Central and 1
## 21 Central Asia 1
## 26 Central bakery 1
## 22 Central Broken 1
## 23 Central Library 1
## 24 Central Vietnamese 1
## 27 Central.Frat 1
## 28 Central.Hoochie 1
## 29 East German 1
## 32 East is 1
## 30 East LA 1
## 31 East Madison 1
## 33 East meats 1
## 34 East of 1
## 35 East several 1
## 36 East side 1
## 44 North and 1
## 37 North Bend 1
## 38 North Coffee 1
## 45 North end 1
## 39 North Exit 1
## 46 North from 1
## 40 North Greenwood 1
## 41 North Indian 1
## 47 North of 1
## 42 North Rileup 1
## 43 North West 1
## 53 South and 1
## 48 South Bay 1
## 54 South but 1
## 49 South Carolina 1
## 50 South Center 1
## 51 South Main 1
## 55 South on 1
## 52 South Pasadena 1
## 56 West Caribbean 1
## 57 West Virginiia 1
## label step_major
## 3 extract.features_process.text 3
## 4 extract.features_process.text_reporting_compound_terms 3
## step_minor bgn end elapsed
## 3 0 24.347 69.561 45.214
## 4 1 69.561 NA NA
## [1] "Remaining compound terms in review.my: "
## label step_major
## 4 extract.features_process.text_reporting_compound_terms 3
## 5 extract.features_build.corpus 4
## step_minor bgn end elapsed
## 4 1 69.561 69.567 0.006
## 5 0 69.567 NA NA
## [1] "Building glb_corpus_lst..."
## label step_major step_minor bgn end elapsed
## 5 extract.features_build.corpus 4 0 69.567 78.46 8.893
## 6 extract.features_extract.DTM 5 0 78.461 NA NA
## [1] "Extracting TfIDf terms for review.my..."
## label step_major step_minor bgn end elapsed
## 6 extract.features_extract.DTM 5 0 78.461 80.364 1.903
## 7 extract.features_report.DTM 6 0 80.365 NA NA
## [1] "Reporting TfIDf terms for review.my..."
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 746, terms: 30897)>>
## Non-/sparse entries: 275334/22773828
## Sparsity : 99%
## Maximal term length: 47
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## [1] " Sparse TermMatrix:"
## <<DocumentTermMatrix (documents: 746, terms: 12)>>
## Non-/sparse entries: 7068/1884
## Sparsity : 21%
## Maximal term length: 6
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Warning in myplot_scatter(plt_TfIdf_df, "freq.full", "TfIdf.full",
## colorcol_name = "in.sprs"): converting in.sprs to class:factor
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 6 rows containing missing values
## (geom_path).
## label step_major step_minor bgn end elapsed
## 7 extract.features_report.DTM 6 0 80.365 85.112 4.747
## 8 extract.features_bind.DTM 7 0 85.112 NA NA
## [1] "Binding DTM for review.my..."
## label step_major step_minor bgn end elapsed
## 8 extract.features_bind.DTM 7 0 85.112 85.119 0.007
## 9 extract.features_bind.DXM 8 0 85.119 NA NA
## [1] "Binding DXM for review.my..."
## Warning in rm(log_X_df, txt_X_df): object 'log_X_df' not found
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# print(myplot_scatter(glb_trnobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# Free the corpus / DTM intermediates. Only objects that currently exist are
# removed: some of these names are not created on every run, and an
# unconditional rm() then warns (e.g. "object 'corpus_lst' not found").
rm(list=intersect(c("corpus_lst", "full_TfIdf_DTM", "full_TfIdf_vctr",
                    "glb_full_DTM_lst", "glb_sprs_DTM_lst",
                    "txt_corpus", "txt_vctr"),
                  ls()))
## Warning in rm(corpus_lst, full_TfIdf_DTM, full_TfIdf_vctr,
## glb_full_DTM_lst, : object 'corpus_lst' not found
# Register the final "extract.features_end" step; per the knitted output
# this fills in the end / elapsed time of the preceding bind.DXM step.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df, "extract.features_end",
major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 9 extract.features_bind.DXM 8 0 85.119 127.898 42.779
## 10 extract.features_end 9 0 127.899 NA NA
# Report the per-step timings collected in extract.features_chunk_df (the
# knitted output lists the steps by duration and the total elapsed time).
myplt_chunk(extract.features_chunk_df)
## label step_major
## 3 extract.features_process.text 3
## 9 extract.features_bind.DXM 8
## 5 extract.features_build.corpus 4
## 7 extract.features_report.DTM 6
## 6 extract.features_extract.DTM 5
## 2 extract.features_factorize.str.vars 2
## 1 extract.features_bgn 1
## 8 extract.features_bind.DTM 7
## 4 extract.features_process.text_reporting_compound_terms 3
## step_minor bgn end elapsed duration
## 3 0 24.347 69.561 45.214 45.214
## 9 0 85.119 127.898 42.779 42.779
## 5 0 69.567 78.460 8.893 8.893
## 7 0 80.365 85.112 4.747 4.747
## 6 0 78.461 80.364 1.903 1.903
## 2 0 24.039 24.346 0.307 0.307
## 1 0 24.029 24.039 0.010 0.010
## 8 0 85.112 85.119 0.007 0.007
## 4 1 69.561 69.567 0.006 0.006
## [1] "Total Elapsed Time: 127.898 secs"
# if (glb_save_envir)
# save(glb_feats_df,
# glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
# file=paste0(glb_out_pfx, "extract_features_dsk.RData"))
# load(paste0(glb_out_pfx, "extract_features_dsk.RData"))
# Record that the training and new-data objects are now available, then
# replay the analytics petri net with the updated list of transitions.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                            "data.training.all", "data.new")
replay.petrisim(pn=glb_analytics_pn,
                replay.trans=glb_analytics_avl_objs,
                flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
# Open the top-level "cluster.data" step in the global chunk-tracking table.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "cluster.data", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 6 extract.features 3 0 24.022 129.336 105.315
## 7 cluster.data 4 0 129.337 NA NA
4.0: cluster data

if (glb_cluster) {
# Packages needed by the clustering code below (cosine dist() method and
# cutreeDynamic()). library() is used so that a missing package stops the run
# here instead of require() returning FALSE and the failure surfacing later.
library(proxy)
#library(hash)
library(dynamicTreeCut)
#     glb_hash <- hash(key=unique(glb_allobs_df$myCategory),
#                      values=1:length(unique(glb_allobs_df$myCategory)))
#     glb_hash_lst <- hash(key=unique(glb_allobs_df$myCategory),
#                      values=1:length(unique(glb_allobs_df$myCategory)))
#stophere; sav_allobs_df <- glb_allobs_df;

# Cluster on the term (.T.) and pattern (.P.) features whose names start
# with an H, S or A prefix.
print("Clustering features: ")
cluster_vars <- grep("[HSA]\\.[PT]\\.", names(glb_allobs_df), value=TRUE)
print(cluster_vars)
#print(cluster_vars <- grep("[HSA]\\.", names(glb_allobs_df), value=TRUE))

# Every observation starts in cluster 1; the selected categories are refined
# in the loop that follows.
glb_allobs_df$.clusterid <- 1
#print(max(table(glb_allobs_df$myCategory.fctr) / 20))
# Hierarchically cluster the observations of each selected category on the
# cluster_vars features (cosine distance, Ward linkage) and store the
# resulting cluster id in glb_allobs_df$.clusterid.
for (myCategory in c("##", "Business#Business Day#Dealbook", "OpEd#Opinion#",
                     "Styles#U.S.#", "Business#Technology#", "Science#Health#",
                     "Culture#Arts#")) {
    ctgry_allobs_df <- glb_allobs_df[glb_allobs_df$myCategory == myCategory, ]

    dstns_dist <- dist(ctgry_allobs_df[, cluster_vars], method = "cosine")
    dstns_mtrx <- as.matrix(dstns_dist)

    # Most dissimilar pair. which.max() returns a column-major linear index;
    # the matrix is square and symmetric, so the derived index can be used
    # as a row and the partner is the arg-max within that row.
    print(sprintf("max distance(%0.4f) pair:", max(dstns_mtrx)))
    row_ix <- ceiling(which.max(dstns_mtrx) / ncol(dstns_mtrx))
    col_ix <- which.max(dstns_mtrx[row_ix, ])
    print(ctgry_allobs_df[c(row_ix, col_ix),
        c("UniqueID", "Popular", "myCategory", "Headline", cluster_vars)])

    # Most similar pair. The diagonal (self-distance 0) is masked with Inf
    # so that it can never win or tie with a genuine pair.
    min_dstns_mtrx <- dstns_mtrx
    diag(min_dstns_mtrx) <- Inf
    print(sprintf("min distance(%0.4f) pair:", min(min_dstns_mtrx)))
    row_ix <- ceiling(which.min(min_dstns_mtrx) / ncol(min_dstns_mtrx))
    col_ix <- which.min(min_dstns_mtrx[row_ix, ])
    print(ctgry_allobs_df[c(row_ix, col_ix),
        c("UniqueID", "Popular", "myCategory", "Headline", cluster_vars)])

    clusters <- hclust(dstns_dist, method = "ward.D2")
    #plot(clusters, labels=NULL, hang=-1)
    myplclust(clusters, lab.col=unclass(ctgry_allobs_df[, glb_rsp_var]))

    #clusterGroups = cutree(clusters, k=7)
    clusterGroups <- cutreeDynamic(clusters, minClusterSize=20, method="tree", deepSplit=0)
    # Unassigned groups are labeled 0; the largest group has label 1
    # (values are not auto-printed inside a for loop, hence the print())
    print(table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA="ifany"))
    #print(ctgry_allobs_df[which(clusterGroups == 1), c("UniqueID", "Popular", "Headline")])
    #print(ctgry_allobs_df[(clusterGroups == 1) & !is.na(ctgry_allobs_df$Popular) & (ctgry_allobs_df$Popular==1), c("UniqueID", "Popular", "Headline")])
    # fold the unassigned observations into the largest cluster
    clusterGroups[clusterGroups == 0] <- 1
    print(table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA="ifany"))
    #summary(factor(clusterGroups))
#         clusterGroups <- clusterGroups +
#                 100 * # has to be > max(table(glb_allobs_df$myCategory.fctr) / minClusterSize=20)
#                             which(levels(glb_allobs_df$myCategory.fctr) == myCategory)
#         table(clusterGroups, ctgry_allobs_df[, glb_rsp_var], useNA="ifany")

    # add to glb_allobs_df - then split the data again
    glb_allobs_df[glb_allobs_df$myCategory==myCategory,]$.clusterid <- clusterGroups
    #print(unique(glb_allobs_df$.clusterid))
    #print(glb_feats_df[glb_feats_df$id == ".clusterid.fctr", ])
}
# Cross-tabulate category x cluster x response, ordered via the .n column.
ctgry_xtab_df <- mycreate_sqlxtab_df(glb_allobs_df,
                                     c("myCategory", ".clusterid", glb_rsp_var))
ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")), ctgry_xtab_df)

# Wide form: one row per category/cluster, one column per response level.
ctgry_cast_df <- dcast(ctgry_xtab_df,
                       myCategory + .clusterid ~ Popular.fctr,
                       sum, value.var=".n")
ctgry_cast_df <- orderBy(~ -Y -NA, ctgry_cast_df)
print(ctgry_cast_df)
#print(orderBy(~ myCategory -Y -NA, ctgry_cast_df))
# write.table(ctgry_cast_df, paste0(glb_out_pfx, "ctgry_clst.csv"),
#             row.names=FALSE)

ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df$.clusterid,
                       glb_allobs_df[, glb_rsp_var],
                       useNA="ifany")
print(ctgry_sum_tbl)
# dsp_obs(.clusterid=1, myCategory="OpEd#Opinion#",
#         cols=c("UniqueID", "Popular", "myCategory", ".clusterid", "Headline"),
#         all=TRUE)

# Expose the cluster id as a factor that is only used in interaction with
# myCategory.fctr; the numeric id and the raw clustering inputs are excluded
# from the feature set.
glb_allobs_df$.clusterid.fctr <- as.factor(glb_allobs_df$.clusterid)
glb_interaction_only_features["myCategory.fctr"] <- ".clusterid.fctr"
glb_exclude_vars_as_features <- c(glb_exclude_vars_as_features,
                                  ".clusterid", cluster_vars)
}
# Re-partition: glb_allobs_df gained columns above, so rebuild the training
# and new-observation frames from it (rows with a missing .src are dropped).
glb_trnobs_df <- glb_allobs_df[which(glb_allobs_df[[".src"]] == "Train"), ]
glb_newobs_df <- glb_allobs_df[which(glb_allobs_df[[".src"]] == "Test"), ]

# Open the top-level "select.features" step in the chunk-tracking table.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "select.features",
                             major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 7 cluster.data 4 0 129.337 130.484 1.147
## 8 select.features 5 0 130.484 NA NA
5.0: select features

print(glb_feats_df <- myselect_features(entity_df=glb_trnobs_df,
exclude_vars_as_features=glb_exclude_vars_as_features,
rsp_var=glb_rsp_var))
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## id cor.y
## dirty dirty 1.000000e+00
## R.npnct19.log R.npnct19.log 2.245129e-01
## R.T.dish R.T.dish 1.979787e-01
## R.npnct07.log R.npnct07.log 1.880709e-01
## R.ndgts.log R.ndgts.log 1.827317e-01
## R.T.pork R.T.pork 1.801002e-01
## R.T.noodl R.T.noodl 1.650966e-01
## R.npnct08.log R.npnct08.log 1.640528e-01
## R.npnct01.log R.npnct01.log 1.631087e-01
## review.niso8859.log review.niso8859.log 1.624341e-01
## R.nwrds.log R.nwrds.log 1.610645e-01
## R.nuppr.log R.nuppr.log 1.587833e-01
## R.nwrds.unq.log R.nwrds.unq.log 1.586825e-01
## R.nchrs.log R.nchrs.log 1.586282e-01
## R.npnct13.log R.npnct13.log 1.580031e-01
## R.nstopwrds.log R.nstopwrds.log 1.576995e-01
## R.npnct15.log R.npnct15.log 1.568546e-01
## R.npnct11.log R.npnct11.log 1.527621e-01
## R.npnct12.log R.npnct12.log 1.522268e-01
## R.npnct14.log R.npnct14.log 1.515787e-01
## R.npnct04.log R.npnct04.log 1.474628e-01
## R.npnct02.log R.npnct02.log 1.394844e-01
## R.T.rice R.T.rice 1.331662e-01
## R.T.thai R.T.thai 1.273370e-01
## R.npnct30.log R.npnct30.log 1.269623e-01
## R.npnct05.log R.npnct05.log 1.246365e-01
## R.T.chees R.T.chees -1.178283e-01
## R.T.banh R.T.banh 1.176038e-01
## R.T.sum R.T.sum 1.156674e-01
## R.T.dim R.T.dim 1.107069e-01
## R.T.nice R.T.nice -1.036814e-01
## R.T.salad R.T.salad -1.021960e-01
## R.T.chines R.T.chines 1.003843e-01
## R.npnct09.log R.npnct09.log 9.433979e-02
## R.npnct03.log R.npnct03.log 9.397071e-02
## R.T.like R.T.like -9.225243e-02
## R.T.roll R.T.roll 8.991836e-02
## R.T.chicken R.T.chicken 8.650427e-02
## R.T.bar R.T.bar -8.600445e-02
## .rnorm .rnorm 8.551826e-02
## R.npnct18.log R.npnct18.log 8.253867e-02
## R.ratio.sum.TfIdf.nwrds R.ratio.sum.TfIdf.nwrds -7.780177e-02
## R.T.seattl R.T.seattl 7.491085e-02
## R.npnct21.log R.npnct21.log 7.455775e-02
## R.T.vietnames R.T.vietnames 7.300890e-02
## R.T.cheap R.T.cheap 7.286196e-02
## R.T.bbq R.T.bbq 7.249209e-02
## R.T.park R.T.park 7.237790e-02
## R.npnct10.log R.npnct10.log 7.146157e-02
## R.T.egg R.T.egg 7.006131e-02
## R.T.tofu R.T.tofu 6.974134e-02
## R.T.best R.T.best 6.739375e-02
## R.T.drink R.T.drink -6.633759e-02
## R.P.http R.P.http 6.606126e-02
## R.npnct23.log R.npnct23.log 6.387801e-02
## R.npnct25.log R.npnct25.log 6.290420e-02
## R.T.falafel R.T.falafel 6.231497e-02
## R.T.beer R.T.beer -6.199584e-02
## R.T.happi R.T.happi -6.195589e-02
## R.T.pizza R.T.pizza -6.081540e-02
## R.npnct28.log R.npnct28.log 6.063391e-02
## R.T.just R.T.just -6.035362e-02
## R.T.good R.T.good -6.033749e-02
## R.T.deli R.T.deli 5.759853e-02
## R.T.alway R.T.alway -5.696007e-02
## R.T.soup R.T.soup 5.692613e-02
## R.T.realli R.T.realli -5.191088e-02
## R.T.cake R.T.cake -5.176378e-02
## R.npnct27.log R.npnct27.log 4.868565e-02
## R.npnct29.log R.npnct29.log 4.868565e-02
## R.T.ethiopian R.T.ethiopian -4.833778e-02
## R.T.great R.T.great 4.753125e-02
## R.T.sandwich R.T.sandwich -4.722423e-02
## R.T.burger R.T.burger -4.665500e-02
## R.npnct16.log R.npnct16.log 4.473265e-02
## R.T.get R.T.get 4.418473e-02
## R.T.pretti R.T.pretti -4.360764e-02
## R.T.friend R.T.friend -4.306420e-02
## R.npnct22.log R.npnct22.log 4.283529e-02
## R.npnct26.log R.npnct26.log 4.283529e-02
## R.T.fri R.T.fri 4.263357e-02
## R.T.delici R.T.delici -4.121077e-02
## R.T.mexican R.T.mexican -3.930427e-02
## R.ratio.nstopwrds.nwrds R.ratio.nstopwrds.nwrds -3.904108e-02
## R.T.philli R.T.philli 3.812319e-02
## R.T.price R.T.price -3.811137e-02
## R.T.taco R.T.taco -3.785186e-02
## R.T.night R.T.night -3.731382e-02
## R.T.coffe R.T.coffe -3.731231e-02
## R.T.pho R.T.pho 3.680428e-02
## R.T.love R.T.love 3.525386e-02
## R.T.order R.T.order -3.449862e-02
## R.T.sandwhich R.T.sandwhich -3.447413e-02
## R.T.food R.T.food 3.403065e-02
## R.T.tast R.T.tast -3.250882e-02
## R.T.time R.T.time 3.061799e-02
## R.T.hour R.T.hour -2.929357e-02
## R.T.restaur R.T.restaur 2.703862e-02
## R.sum.TfIdf R.sum.TfIdf -2.553415e-02
## R.npnct06.log R.npnct06.log 2.485017e-02
## R.T.tri R.T.tri 2.382265e-02
## R.T.deliveri R.T.deliveri -2.103602e-02
## R.T.breakfast R.T.breakfast -1.673764e-02
## R.T.sauc R.T.sauc 1.290473e-02
## R.T.sushi R.T.sushi 1.210830e-02
## R.T.servic R.T.servic 1.182398e-02
## R.T.ive R.T.ive 1.057633e-02
## R.T.can R.T.can 8.816060e-03
## R.npnct20.log R.npnct20.log 8.243086e-03
## R.T.menu R.T.menu 7.905538e-03
## R.T.one R.T.one -7.172369e-03
## R.T.teriyaki R.T.teriyaki 6.845139e-03
## R.T.fresh R.T.fresh -3.537235e-03
## R.T.tabl R.T.tabl -3.261586e-03
## R.T.place R.T.place -2.975074e-03
## R.T.lunch R.T.lunch 1.821254e-03
## R.P.year.colon R.P.year.colon -8.303515e-21
## R.npnct17.log R.npnct17.log NA
## R.npnct24.log R.npnct24.log NA
## R.P.daily.clip.report R.P.daily.clip.report NA
## R.P.fashion.week R.P.fashion.week NA
## R.P.first.draft R.P.first.draft NA
## exclude.as.feat cor.y.abs
## dirty 1 1.000000e+00
## R.npnct19.log 0 2.245129e-01
## R.T.dish 0 1.979787e-01
## R.npnct07.log 0 1.880709e-01
## R.ndgts.log 0 1.827317e-01
## R.T.pork 0 1.801002e-01
## R.T.noodl 0 1.650966e-01
## R.npnct08.log 0 1.640528e-01
## R.npnct01.log 0 1.631087e-01
## review.niso8859.log 0 1.624341e-01
## R.nwrds.log 0 1.610645e-01
## R.nuppr.log 0 1.587833e-01
## R.nwrds.unq.log 0 1.586825e-01
## R.nchrs.log 0 1.586282e-01
## R.npnct13.log 0 1.580031e-01
## R.nstopwrds.log 0 1.576995e-01
## R.npnct15.log 0 1.568546e-01
## R.npnct11.log 0 1.527621e-01
## R.npnct12.log 0 1.522268e-01
## R.npnct14.log 0 1.515787e-01
## R.npnct04.log 0 1.474628e-01
## R.npnct02.log 0 1.394844e-01
## R.T.rice 0 1.331662e-01
## R.T.thai 0 1.273370e-01
## R.npnct30.log 0 1.269623e-01
## R.npnct05.log 0 1.246365e-01
## R.T.chees 0 1.178283e-01
## R.T.banh 0 1.176038e-01
## R.T.sum 0 1.156674e-01
## R.T.dim 0 1.107069e-01
## R.T.nice 0 1.036814e-01
## R.T.salad 0 1.021960e-01
## R.T.chines 0 1.003843e-01
## R.npnct09.log 0 9.433979e-02
## R.npnct03.log 0 9.397071e-02
## R.T.like 0 9.225243e-02
## R.T.roll 0 8.991836e-02
## R.T.chicken 0 8.650427e-02
## R.T.bar 0 8.600445e-02
## .rnorm 0 8.551826e-02
## R.npnct18.log 0 8.253867e-02
## R.ratio.sum.TfIdf.nwrds 0 7.780177e-02
## R.T.seattl 0 7.491085e-02
## R.npnct21.log 0 7.455775e-02
## R.T.vietnames 0 7.300890e-02
## R.T.cheap 0 7.286196e-02
## R.T.bbq 0 7.249209e-02
## R.T.park 0 7.237790e-02
## R.npnct10.log 0 7.146157e-02
## R.T.egg 0 7.006131e-02
## R.T.tofu 0 6.974134e-02
## R.T.best 0 6.739375e-02
## R.T.drink 0 6.633759e-02
## R.P.http 0 6.606126e-02
## R.npnct23.log 0 6.387801e-02
## R.npnct25.log 0 6.290420e-02
## R.T.falafel 0 6.231497e-02
## R.T.beer 0 6.199584e-02
## R.T.happi 0 6.195589e-02
## R.T.pizza 0 6.081540e-02
## R.npnct28.log 0 6.063391e-02
## R.T.just 0 6.035362e-02
## R.T.good 0 6.033749e-02
## R.T.deli 0 5.759853e-02
## R.T.alway 0 5.696007e-02
## R.T.soup 0 5.692613e-02
## R.T.realli 0 5.191088e-02
## R.T.cake 0 5.176378e-02
## R.npnct27.log 0 4.868565e-02
## R.npnct29.log 0 4.868565e-02
## R.T.ethiopian 0 4.833778e-02
## R.T.great 0 4.753125e-02
## R.T.sandwich 0 4.722423e-02
## R.T.burger 0 4.665500e-02
## R.npnct16.log 0 4.473265e-02
## R.T.get 0 4.418473e-02
## R.T.pretti 0 4.360764e-02
## R.T.friend 0 4.306420e-02
## R.npnct22.log 0 4.283529e-02
## R.npnct26.log 0 4.283529e-02
## R.T.fri 0 4.263357e-02
## R.T.delici 0 4.121077e-02
## R.T.mexican 0 3.930427e-02
## R.ratio.nstopwrds.nwrds 0 3.904108e-02
## R.T.philli 0 3.812319e-02
## R.T.price 0 3.811137e-02
## R.T.taco 0 3.785186e-02
## R.T.night 0 3.731382e-02
## R.T.coffe 0 3.731231e-02
## R.T.pho 0 3.680428e-02
## R.T.love 0 3.525386e-02
## R.T.order 0 3.449862e-02
## R.T.sandwhich 0 3.447413e-02
## R.T.food 0 3.403065e-02
## R.T.tast 0 3.250882e-02
## R.T.time 0 3.061799e-02
## R.T.hour 0 2.929357e-02
## R.T.restaur 0 2.703862e-02
## R.sum.TfIdf 0 2.553415e-02
## R.npnct06.log 0 2.485017e-02
## R.T.tri 0 2.382265e-02
## R.T.deliveri 0 2.103602e-02
## R.T.breakfast 0 1.673764e-02
## R.T.sauc 0 1.290473e-02
## R.T.sushi 0 1.210830e-02
## R.T.servic 0 1.182398e-02
## R.T.ive 0 1.057633e-02
## R.T.can 0 8.816060e-03
## R.npnct20.log 0 8.243086e-03
## R.T.menu 0 7.905538e-03
## R.T.one 0 7.172369e-03
## R.T.teriyaki 0 6.845139e-03
## R.T.fresh 0 3.537235e-03
## R.T.tabl 0 3.261586e-03
## R.T.place 0 2.975074e-03
## R.T.lunch 0 1.821254e-03
## R.P.year.colon 0 8.303515e-21
## R.npnct17.log 0 NA
## R.npnct24.log 0 NA
## R.P.daily.clip.report 0 NA
## R.P.fashion.week 0 NA
## R.P.first.draft 0 NA
# sav_feats_df <- glb_feats_df; glb_feats_df <- sav_feats_df
# Flag highly inter-correlated features (see the cor(...) messages in the
# output), then list the features by decreasing correlation with the response.
glb_feats_df <- orderBy(~-cor.y,
                        myfind_cor_features(feats_df=glb_feats_df,
                                            obs_df=glb_trnobs_df,
                                            rsp_var=glb_rsp_var))
print(glb_feats_df)
## [1] "cor(R.nchrs.log, R.nwrds.log)=0.9996"
## [1] "cor(dirty.fctr, R.nchrs.log)=0.1586"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.nchrs.log as highly correlated with
## R.nwrds.log
## [1] "cor(R.nstopwrds.log, R.nwrds.log)=0.9985"
## [1] "cor(dirty.fctr, R.nstopwrds.log)=0.1577"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.nstopwrds.log as highly correlated with
## R.nwrds.log
## [1] "cor(R.nwrds.log, R.nwrds.unq.log)=0.9946"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## [1] "cor(dirty.fctr, R.nwrds.unq.log)=0.1587"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.nwrds.unq.log as highly correlated with
## R.nwrds.log
## [1] "cor(R.T.dim, R.T.sum)=0.9862"
## [1] "cor(dirty.fctr, R.T.dim)=0.1107"
## [1] "cor(dirty.fctr, R.T.sum)=0.1157"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.T.dim as highly correlated with R.T.sum
## [1] "cor(R.nuppr.log, R.nwrds.log)=0.9807"
## [1] "cor(dirty.fctr, R.nuppr.log)=0.1588"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.nuppr.log as highly correlated with
## R.nwrds.log
## [1] "cor(R.npnct13.log, R.nwrds.log)=0.9762"
## [1] "cor(dirty.fctr, R.npnct13.log)=0.1580"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct13.log as highly correlated with
## R.nwrds.log
## [1] "cor(R.npnct11.log, R.nwrds.log)=0.9614"
## [1] "cor(dirty.fctr, R.npnct11.log)=0.1528"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct11.log as highly correlated with
## R.nwrds.log
## [1] "cor(R.npnct07.log, R.nwrds.log)=0.9383"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## [1] "cor(dirty.fctr, R.nwrds.log)=0.1611"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.nwrds.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct07.log, R.npnct08.log)=0.8700"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## [1] "cor(dirty.fctr, R.npnct08.log)=0.1641"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct08.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct07.log, R.npnct12.log)=0.8642"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## [1] "cor(dirty.fctr, R.npnct12.log)=0.1522"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct12.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.ndgts.log, R.npnct07.log)=0.8358"
## [1] "cor(dirty.fctr, R.ndgts.log)=0.1827"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.ndgts.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct07.log, review.niso8859.log)=0.8353"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## [1] "cor(dirty.fctr, review.niso8859.log)=0.1624"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified review.niso8859.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct01.log, R.npnct07.log)=0.8087"
## [1] "cor(dirty.fctr, R.npnct01.log)=0.1631"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct01.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct07.log, R.npnct15.log)=0.7999"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## [1] "cor(dirty.fctr, R.npnct15.log)=0.1569"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct15.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct02.log, R.npnct07.log)=0.7681"
## [1] "cor(dirty.fctr, R.npnct02.log)=0.1395"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct02.log as highly correlated with
## R.npnct07.log
## [1] "cor(R.npnct07.log, R.npnct19.log)=0.7600"
## [1] "cor(dirty.fctr, R.npnct07.log)=0.1881"
## [1] "cor(dirty.fctr, R.npnct19.log)=0.2245"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified R.npnct07.log as highly correlated with
## R.npnct19.log
## id cor.y exclude.as.feat cor.y.abs
## 2 dirty 1.000000e+00 1 1.000000e+00
## 23 R.npnct19.log 2.245129e-01 0 2.245129e-01
## 66 R.T.dish 1.979787e-01 0 1.979787e-01
## 11 R.npnct07.log 1.880709e-01 0 1.880709e-01
## 4 R.ndgts.log 1.827317e-01 0 1.827317e-01
## 97 R.T.pork 1.801002e-01 0 1.801002e-01
## 89 R.T.noodl 1.650966e-01 0 1.650966e-01
## 12 R.npnct08.log 1.640528e-01 0 1.640528e-01
## 5 R.npnct01.log 1.631087e-01 0 1.631087e-01
## 122 review.niso8859.log 1.624341e-01 0 1.624341e-01
## 37 R.nwrds.log 1.610645e-01 0 1.610645e-01
## 36 R.nuppr.log 1.587833e-01 0 1.587833e-01
## 38 R.nwrds.unq.log 1.586825e-01 0 1.586825e-01
## 3 R.nchrs.log 1.586282e-01 0 1.586282e-01
## 17 R.npnct13.log 1.580031e-01 0 1.580031e-01
## 35 R.nstopwrds.log 1.576995e-01 0 1.576995e-01
## 19 R.npnct15.log 1.568546e-01 0 1.568546e-01
## 15 R.npnct11.log 1.527621e-01 0 1.527621e-01
## 16 R.npnct12.log 1.522268e-01 0 1.522268e-01
## 18 R.npnct14.log 1.515787e-01 0 1.515787e-01
## 8 R.npnct04.log 1.474628e-01 0 1.474628e-01
## 6 R.npnct02.log 1.394844e-01 0 1.394844e-01
## 102 R.T.rice 1.331662e-01 0 1.331662e-01
## 117 R.T.thai 1.273370e-01 0 1.273370e-01
## 34 R.npnct30.log 1.269623e-01 0 1.269623e-01
## 9 R.npnct05.log 1.246365e-01 0 1.246365e-01
## 48 R.T.banh 1.176038e-01 0 1.176038e-01
## 111 R.T.sum 1.156674e-01 0 1.156674e-01
## 65 R.T.dim 1.107069e-01 0 1.107069e-01
## 60 R.T.chines 1.003843e-01 0 1.003843e-01
## 13 R.npnct09.log 9.433979e-02 0 9.433979e-02
## 7 R.npnct03.log 9.397071e-02 0 9.397071e-02
## 103 R.T.roll 8.991836e-02 0 8.991836e-02
## 59 R.T.chicken 8.650427e-02 0 8.650427e-02
## 1 .rnorm 8.551826e-02 0 8.551826e-02
## 22 R.npnct18.log 8.253867e-02 0 8.253867e-02
## 108 R.T.seattl 7.491085e-02 0 7.491085e-02
## 25 R.npnct21.log 7.455775e-02 0 7.455775e-02
## 121 R.T.vietnames 7.300890e-02 0 7.300890e-02
## 57 R.T.cheap 7.286196e-02 0 7.286196e-02
## 50 R.T.bbq 7.249209e-02 0 7.249209e-02
## 92 R.T.park 7.237790e-02 0 7.237790e-02
## 14 R.npnct10.log 7.146157e-02 0 7.146157e-02
## 68 R.T.egg 7.006131e-02 0 7.006131e-02
## 119 R.T.tofu 6.974134e-02 0 6.974134e-02
## 52 R.T.best 6.739375e-02 0 6.739375e-02
## 42 R.P.http 6.606126e-02 0 6.606126e-02
## 27 R.npnct23.log 6.387801e-02 0 6.387801e-02
## 29 R.npnct25.log 6.290420e-02 0 6.290420e-02
## 70 R.T.falafel 6.231497e-02 0 6.231497e-02
## 32 R.npnct28.log 6.063391e-02 0 6.063391e-02
## 62 R.T.deli 5.759853e-02 0 5.759853e-02
## 110 R.T.soup 5.692613e-02 0 5.692613e-02
## 31 R.npnct27.log 4.868565e-02 0 4.868565e-02
## 33 R.npnct29.log 4.868565e-02 0 4.868565e-02
## 77 R.T.great 4.753125e-02 0 4.753125e-02
## 20 R.npnct16.log 4.473265e-02 0 4.473265e-02
## 75 R.T.get 4.418473e-02 0 4.418473e-02
## 26 R.npnct22.log 4.283529e-02 0 4.283529e-02
## 30 R.npnct26.log 4.283529e-02 0 4.283529e-02
## 73 R.T.fri 4.263357e-02 0 4.263357e-02
## 93 R.T.philli 3.812319e-02 0 3.812319e-02
## 94 R.T.pho 3.680428e-02 0 3.680428e-02
## 83 R.T.love 3.525386e-02 0 3.525386e-02
## 71 R.T.food 3.403065e-02 0 3.403065e-02
## 118 R.T.time 3.061799e-02 0 3.061799e-02
## 101 R.T.restaur 2.703862e-02 0 2.703862e-02
## 10 R.npnct06.log 2.485017e-02 0 2.485017e-02
## 120 R.T.tri 2.382265e-02 0 2.382265e-02
## 107 R.T.sauc 1.290473e-02 0 1.290473e-02
## 112 R.T.sushi 1.210830e-02 0 1.210830e-02
## 109 R.T.servic 1.182398e-02 0 1.182398e-02
## 80 R.T.ive 1.057633e-02 0 1.057633e-02
## 56 R.T.can 8.816060e-03 0 8.816060e-03
## 24 R.npnct20.log 8.243086e-03 0 8.243086e-03
## 85 R.T.menu 7.905538e-03 0 7.905538e-03
## 116 R.T.teriyaki 6.845139e-03 0 6.845139e-03
## 84 R.T.lunch 1.821254e-03 0 1.821254e-03
## 43 R.P.year.colon -8.303515e-21 0 8.303515e-21
## 96 R.T.place -2.975074e-03 0 2.975074e-03
## 113 R.T.tabl -3.261586e-03 0 3.261586e-03
## 72 R.T.fresh -3.537235e-03 0 3.537235e-03
## 90 R.T.one -7.172369e-03 0 7.172369e-03
## 53 R.T.breakfast -1.673764e-02 0 1.673764e-02
## 64 R.T.deliveri -2.103602e-02 0 2.103602e-02
## 46 R.sum.TfIdf -2.553415e-02 0 2.553415e-02
## 79 R.T.hour -2.929357e-02 0 2.929357e-02
## 115 R.T.tast -3.250882e-02 0 3.250882e-02
## 105 R.T.sandwhich -3.447413e-02 0 3.447413e-02
## 91 R.T.order -3.449862e-02 0 3.449862e-02
## 61 R.T.coffe -3.731231e-02 0 3.731231e-02
## 88 R.T.night -3.731382e-02 0 3.731382e-02
## 114 R.T.taco -3.785186e-02 0 3.785186e-02
## 99 R.T.price -3.811137e-02 0 3.811137e-02
## 44 R.ratio.nstopwrds.nwrds -3.904108e-02 0 3.904108e-02
## 86 R.T.mexican -3.930427e-02 0 3.930427e-02
## 63 R.T.delici -4.121077e-02 0 4.121077e-02
## 74 R.T.friend -4.306420e-02 0 4.306420e-02
## 98 R.T.pretti -4.360764e-02 0 4.360764e-02
## 54 R.T.burger -4.665500e-02 0 4.665500e-02
## 106 R.T.sandwich -4.722423e-02 0 4.722423e-02
## 69 R.T.ethiopian -4.833778e-02 0 4.833778e-02
## 55 R.T.cake -5.176378e-02 0 5.176378e-02
## 100 R.T.realli -5.191088e-02 0 5.191088e-02
## 47 R.T.alway -5.696007e-02 0 5.696007e-02
## 76 R.T.good -6.033749e-02 0 6.033749e-02
## 81 R.T.just -6.035362e-02 0 6.035362e-02
## 95 R.T.pizza -6.081540e-02 0 6.081540e-02
## 78 R.T.happi -6.195589e-02 0 6.195589e-02
## 51 R.T.beer -6.199584e-02 0 6.199584e-02
## 67 R.T.drink -6.633759e-02 0 6.633759e-02
## 45 R.ratio.sum.TfIdf.nwrds -7.780177e-02 0 7.780177e-02
## 49 R.T.bar -8.600445e-02 0 8.600445e-02
## 82 R.T.like -9.225243e-02 0 9.225243e-02
## 104 R.T.salad -1.021960e-01 0 1.021960e-01
## 87 R.T.nice -1.036814e-01 0 1.036814e-01
## 58 R.T.chees -1.178283e-01 0 1.178283e-01
## 21 R.npnct17.log NA 0 NA
## 28 R.npnct24.log NA 0 NA
## 39 R.P.daily.clip.report NA 0 NA
## 40 R.P.fashion.week NA 0 NA
## 41 R.P.first.draft NA 0 NA
## cor.high.X freqRatio percentUnique zeroVar nzv myNearZV
## 2 <NA> 1.000000 0.3663004 FALSE FALSE FALSE
## 23 <NA> 1.760417 5.1282051 FALSE FALSE FALSE
## 66 <NA> 90.000000 49.0842491 FALSE FALSE FALSE
## 11 R.npnct19.log 1.000000 23.6263736 FALSE FALSE FALSE
## 4 R.npnct07.log 2.312500 18.1318681 FALSE FALSE FALSE
## 97 <NA> 191.500000 29.8534799 FALSE FALSE FALSE
## 89 <NA> 206.500000 23.9926740 FALSE FALSE FALSE
## 12 R.npnct07.log 1.322581 15.9340659 FALSE FALSE FALSE
## 5 R.npnct07.log 1.645833 14.1025641 FALSE FALSE FALSE
## 122 R.npnct07.log 3.739130 20.8791209 FALSE FALSE FALSE
## 37 R.npnct07.log 1.000000 89.7435897 FALSE FALSE FALSE
## 36 R.nwrds.log 1.000000 56.2271062 FALSE FALSE FALSE
## 38 R.nwrds.log 1.000000 75.2747253 FALSE FALSE FALSE
## 3 R.nwrds.log 1.000000 98.3516484 FALSE FALSE FALSE
## 17 R.nwrds.log 1.000000 43.9560440 FALSE FALSE FALSE
## 35 R.nwrds.log 1.000000 79.1208791 FALSE FALSE FALSE
## 19 R.npnct07.log 2.129412 5.6776557 FALSE FALSE FALSE
## 15 R.nwrds.log 1.176471 32.7838828 FALSE FALSE FALSE
## 16 R.npnct07.log 1.296296 13.7362637 FALSE FALSE FALSE
## 18 <NA> 2.461538 4.3956044 FALSE FALSE FALSE
## 8 <NA> 2.433333 4.9450549 FALSE FALSE FALSE
## 6 R.npnct07.log 2.388235 7.6923077 FALSE FALSE FALSE
## 102 <NA> 115.333333 35.3479853 FALSE FALSE FALSE
## 117 <NA> 239.000000 12.2710623 FALSE FALSE FALSE
## 34 <NA> 15.656250 0.7326007 FALSE FALSE FALSE
## 9 <NA> 10.191489 1.0989011 FALSE FALSE FALSE
## 48 <NA> 520.000000 4.9450549 FALSE TRUE TRUE
## 111 <NA> 498.000000 8.9743590 FALSE TRUE TRUE
## 65 R.T.sum 490.000000 10.4395604 FALSE FALSE FALSE
## 60 <NA> 466.000000 14.8351648 FALSE FALSE FALSE
## 13 <NA> 8.078431 3.4798535 FALSE FALSE FALSE
## 7 <NA> 8.410714 1.4652015 FALSE FALSE FALSE
## 103 <NA> 192.000000 29.3040293 FALSE FALSE FALSE
## 59 <NA> 126.000000 52.0146520 FALSE FALSE FALSE
## 1 <NA> 1.000000 100.0000000 FALSE FALSE FALSE
## 22 <NA> 9.645833 2.0146520 FALSE FALSE FALSE
## 108 <NA> 79.666667 53.2967033 FALSE FALSE FALSE
## 25 <NA> 27.421053 0.7326007 FALSE TRUE FALSE
## 121 <NA> 488.000000 10.8058608 FALSE FALSE FALSE
## 57 <NA> 115.666667 35.7142857 FALSE FALSE FALSE
## 50 <NA> 236.000000 13.5531136 FALSE FALSE FALSE
## 92 <NA> 192.000000 29.3040293 FALSE FALSE FALSE
## 14 <NA> 7.500000 1.4652015 FALSE FALSE FALSE
## 68 <NA> 201.000000 26.3736264 FALSE FALSE FALSE
## 119 <NA> 445.000000 18.6813187 FALSE FALSE FALSE
## 52 <NA> 67.666667 59.3406593 FALSE FALSE FALSE
## 42 <NA> 13.184211 0.9157509 FALSE FALSE FALSE
## 27 <NA> 24.761905 0.7326007 FALSE TRUE FALSE
## 29 <NA> 179.000000 0.9157509 FALSE TRUE TRUE
## 70 <NA> 528.000000 3.4798535 FALSE TRUE TRUE
## 32 <NA> 272.000000 0.3663004 FALSE TRUE TRUE
## 62 <NA> 516.000000 5.6776557 FALSE TRUE TRUE
## 110 <NA> 180.500000 33.5164835 FALSE FALSE FALSE
## 31 <NA> 180.666667 0.5494505 FALSE TRUE TRUE
## 33 <NA> 180.666667 0.5494505 FALSE TRUE TRUE
## 77 <NA> 48.666667 67.5824176 FALSE FALSE FALSE
## 20 <NA> 2.652632 4.7619048 FALSE FALSE FALSE
## 75 <NA> 27.750000 72.7106227 FALSE FALSE FALSE
## 26 <NA> 545.000000 0.3663004 FALSE TRUE TRUE
## 30 <NA> 545.000000 0.3663004 FALSE TRUE TRUE
## 73 <NA> 156.000000 41.7582418 FALSE FALSE FALSE
## 93 <NA> 541.000000 1.0989011 FALSE TRUE TRUE
## 94 <NA> 243.000000 10.6227106 FALSE FALSE FALSE
## 83 <NA> 62.333333 61.7216117 FALSE FALSE FALSE
## 71 <NA> 27.000000 77.4725275 FALSE FALSE FALSE
## 118 <NA> 41.333333 70.1465201 FALSE FALSE FALSE
## 101 <NA> 73.333333 57.5091575 FALSE FALSE FALSE
## 10 <NA> 3.818182 3.8461538 FALSE FALSE FALSE
## 120 <NA> 56.000000 64.2857143 FALSE FALSE FALSE
## 107 <NA> 132.000000 49.8168498 FALSE FALSE FALSE
## 112 <NA> 514.000000 6.0439560 FALSE TRUE TRUE
## 109 <NA> 56.333333 64.2857143 FALSE FALSE FALSE
## 80 <NA> 61.666667 61.7216117 FALSE FALSE FALSE
## 56 <NA> 83.500000 64.4688645 FALSE FALSE FALSE
## 24 <NA> 27.578947 0.7326007 FALSE TRUE FALSE
## 85 <NA> 74.666667 56.5934066 FALSE FALSE FALSE
## 116 <NA> 503.000000 8.0586081 FALSE TRUE TRUE
## 84 <NA> 147.000000 44.6886447 FALSE FALSE FALSE
## 43 <NA> 90.000000 0.3663004 FALSE TRUE FALSE
## 96 <NA> 14.200000 77.1062271 FALSE FALSE FALSE
## 113 <NA> 90.666667 48.3516484 FALSE FALSE FALSE
## 72 <NA> 65.250000 48.5347985 FALSE FALSE FALSE
## 90 <NA> 32.250000 67.3992674 FALSE FALSE FALSE
## 53 <NA> 231.000000 15.3846154 FALSE FALSE FALSE
## 64 <NA> 505.000000 7.6923077 FALSE TRUE TRUE
## 46 <NA> 1.000000 100.0000000 FALSE FALSE FALSE
## 79 <NA> 156.000000 41.3919414 FALSE FALSE FALSE
## 115 <NA> 119.000000 53.6630037 FALSE FALSE FALSE
## 105 <NA> 530.000000 3.1135531 FALSE TRUE TRUE
## 91 <NA> 43.000000 71.0622711 FALSE FALSE FALSE
## 61 <NA> 227.000000 16.8498168 FALSE FALSE FALSE
## 88 <NA> 158.500000 41.0256410 FALSE FALSE FALSE
## 114 <NA> 492.000000 10.0732601 FALSE FALSE FALSE
## 99 <NA> 62.666667 60.4395604 FALSE FALSE FALSE
## 44 <NA> 1.333333 97.4358974 FALSE FALSE FALSE
## 86 <NA> 248.000000 9.1575092 FALSE TRUE TRUE
## 63 <NA> 84.000000 51.4652015 FALSE FALSE FALSE
## 74 <NA> 55.333333 63.9194139 FALSE FALSE FALSE
## 98 <NA> 65.000000 60.8058608 FALSE FALSE FALSE
## 54 <NA> 465.000000 15.0183150 FALSE FALSE FALSE
## 106 <NA> 197.000000 27.6556777 FALSE FALSE FALSE
## 69 <NA> 540.000000 1.2820513 FALSE TRUE TRUE
## 55 <NA> 467.000000 14.6520147 FALSE FALSE FALSE
## 100 <NA> 48.000000 69.5970696 FALSE FALSE FALSE
## 47 <NA> 104.500000 58.2417582 FALSE FALSE FALSE
## 76 <NA> 15.000000 78.7545788 FALSE FALSE FALSE
## 81 <NA> 29.000000 70.5128205 FALSE FALSE FALSE
## 95 <NA> 239.000000 12.2710623 FALSE FALSE FALSE
## 78 <NA> 104.333333 42.1245421 FALSE FALSE FALSE
## 51 <NA> 209.000000 23.4432234 FALSE FALSE FALSE
## 67 <NA> 152.500000 43.5897436 FALSE FALSE FALSE
## 45 <NA> 1.000000 100.0000000 FALSE FALSE FALSE
## 49 <NA> 341.000000 37.7289377 FALSE FALSE FALSE
## 82 <NA> 23.500000 74.1758242 FALSE FALSE FALSE
## 104 <NA> 170.500000 36.9963370 FALSE FALSE FALSE
## 87 <NA> 97.500000 61.5384615 FALSE FALSE FALSE
## 58 <NA> 185.000000 31.6849817 FALSE FALSE FALSE
## 21 <NA> 0.000000 0.1831502 TRUE TRUE TRUE
## 28 <NA> 0.000000 0.1831502 TRUE TRUE TRUE
## 39 <NA> 0.000000 0.1831502 TRUE TRUE TRUE
## 40 <NA> 0.000000 0.1831502 TRUE TRUE TRUE
## 41 <NA> 0.000000 0.1831502 TRUE TRUE TRUE
## is.cor.y.abs.low
## 2 FALSE
## 23 FALSE
## 66 FALSE
## 11 FALSE
## 4 FALSE
## 97 FALSE
## 89 FALSE
## 12 FALSE
## 5 FALSE
## 122 FALSE
## 37 FALSE
## 36 FALSE
## 38 FALSE
## 3 FALSE
## 17 FALSE
## 35 FALSE
## 19 FALSE
## 15 FALSE
## 16 FALSE
## 18 FALSE
## 8 FALSE
## 6 FALSE
## 102 FALSE
## 117 FALSE
## 34 FALSE
## 9 FALSE
## 48 FALSE
## 111 FALSE
## 65 FALSE
## 60 FALSE
## 13 FALSE
## 7 FALSE
## 103 FALSE
## 59 FALSE
## 1 FALSE
## 22 TRUE
## 108 TRUE
## 25 TRUE
## 121 TRUE
## 57 TRUE
## 50 TRUE
## 92 TRUE
## 14 TRUE
## 68 TRUE
## 119 TRUE
## 52 TRUE
## 42 TRUE
## 27 TRUE
## 29 TRUE
## 70 TRUE
## 32 TRUE
## 62 TRUE
## 110 TRUE
## 31 TRUE
## 33 TRUE
## 77 TRUE
## 20 TRUE
## 75 TRUE
## 26 TRUE
## 30 TRUE
## 73 TRUE
## 93 TRUE
## 94 TRUE
## 83 TRUE
## 71 TRUE
## 118 TRUE
## 101 TRUE
## 10 TRUE
## 120 TRUE
## 107 TRUE
## 112 TRUE
## 109 TRUE
## 80 TRUE
## 56 TRUE
## 24 TRUE
## 85 TRUE
## 116 TRUE
## 84 TRUE
## 43 TRUE
## 96 TRUE
## 113 TRUE
## 72 TRUE
## 90 TRUE
## 53 TRUE
## 64 TRUE
## 46 TRUE
## 79 TRUE
## 115 TRUE
## 105 TRUE
## 91 TRUE
## 61 TRUE
## 88 TRUE
## 114 TRUE
## 99 TRUE
## 44 TRUE
## 86 TRUE
## 63 TRUE
## 74 TRUE
## 98 TRUE
## 54 TRUE
## 106 TRUE
## 69 TRUE
## 55 TRUE
## 100 TRUE
## 47 TRUE
## 76 TRUE
## 81 TRUE
## 95 TRUE
## 78 TRUE
## 51 TRUE
## 67 TRUE
## 45 TRUE
## 49 FALSE
## 82 FALSE
## 104 FALSE
## 87 FALSE
## 58 FALSE
## 21 NA
## 28 NA
## 39 NA
## 40 NA
## 41 NA
#subset(glb_feats_df, id %in% c("A.nuppr.log", "S.nuppr.log"))
# Scatter plot of the feature table using the percentUnique and freqRatio columns,
# coloured by the myNearZV flag, with an extra point layer whose shape encodes nzv.
# NOTE(review): presumably percentUnique is the x-axis and freqRatio the y-axis
# (argument order of myplot_scatter) -- confirm against the helper.
# xlim(-5, 25) restricts the x-axis; ggplot2 drops points outside that range and
# reports them as "Removed ... rows containing missing values".
print(myplot_scatter(glb_feats_df, "percentUnique", "freqRatio",
colorcol_name="myNearZV", jitter=TRUE) +
geom_point(aes(shape=nzv)) + xlim(-5, 25))
## Warning in myplot_scatter(glb_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "myNearZV", : converting myNearZV to class:factor
## Warning in loop_apply(n, do.ply): Removed 60 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 60 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 60 rows containing missing values
## (geom_point).
# List the features flagged as near-zero-variance; rows whose flag is NA are left out.
print(glb_feats_df[which(glb_feats_df$myNearZV), , drop=FALSE])
## id cor.y exclude.as.feat cor.y.abs
## 48 R.T.banh 0.117603842 0 0.117603842
## 111 R.T.sum 0.115667438 0 0.115667438
## 29 R.npnct25.log 0.062904203 0 0.062904203
## 70 R.T.falafel 0.062314968 0 0.062314968
## 32 R.npnct28.log 0.060633906 0 0.060633906
## 62 R.T.deli 0.057598530 0 0.057598530
## 31 R.npnct27.log 0.048685648 0 0.048685648
## 33 R.npnct29.log 0.048685648 0 0.048685648
## 26 R.npnct22.log 0.042835294 0 0.042835294
## 30 R.npnct26.log 0.042835294 0 0.042835294
## 93 R.T.philli 0.038123187 0 0.038123187
## 112 R.T.sushi 0.012108295 0 0.012108295
## 116 R.T.teriyaki 0.006845139 0 0.006845139
## 64 R.T.deliveri -0.021036018 0 0.021036018
## 105 R.T.sandwhich -0.034474131 0 0.034474131
## 86 R.T.mexican -0.039304267 0 0.039304267
## 69 R.T.ethiopian -0.048337785 0 0.048337785
## 21 R.npnct17.log NA 0 NA
## 28 R.npnct24.log NA 0 NA
## 39 R.P.daily.clip.report NA 0 NA
## 40 R.P.fashion.week NA 0 NA
## 41 R.P.first.draft NA 0 NA
## cor.high.X freqRatio percentUnique zeroVar nzv myNearZV
## 48 <NA> 520.0000 4.9450549 FALSE TRUE TRUE
## 111 <NA> 498.0000 8.9743590 FALSE TRUE TRUE
## 29 <NA> 179.0000 0.9157509 FALSE TRUE TRUE
## 70 <NA> 528.0000 3.4798535 FALSE TRUE TRUE
## 32 <NA> 272.0000 0.3663004 FALSE TRUE TRUE
## 62 <NA> 516.0000 5.6776557 FALSE TRUE TRUE
## 31 <NA> 180.6667 0.5494505 FALSE TRUE TRUE
## 33 <NA> 180.6667 0.5494505 FALSE TRUE TRUE
## 26 <NA> 545.0000 0.3663004 FALSE TRUE TRUE
## 30 <NA> 545.0000 0.3663004 FALSE TRUE TRUE
## 93 <NA> 541.0000 1.0989011 FALSE TRUE TRUE
## 112 <NA> 514.0000 6.0439560 FALSE TRUE TRUE
## 116 <NA> 503.0000 8.0586081 FALSE TRUE TRUE
## 64 <NA> 505.0000 7.6923077 FALSE TRUE TRUE
## 105 <NA> 530.0000 3.1135531 FALSE TRUE TRUE
## 86 <NA> 248.0000 9.1575092 FALSE TRUE TRUE
## 69 <NA> 540.0000 1.2820513 FALSE TRUE TRUE
## 21 <NA> 0.0000 0.1831502 TRUE TRUE TRUE
## 28 <NA> 0.0000 0.1831502 TRUE TRUE TRUE
## 39 <NA> 0.0000 0.1831502 TRUE TRUE TRUE
## 40 <NA> 0.0000 0.1831502 TRUE TRUE TRUE
## 41 <NA> 0.0000 0.1831502 TRUE TRUE TRUE
## is.cor.y.abs.low
## 48 FALSE
## 111 FALSE
## 29 TRUE
## 70 TRUE
## 32 TRUE
## 62 TRUE
## 31 TRUE
## 33 TRUE
## 26 TRUE
## 30 TRUE
## 93 TRUE
## 112 TRUE
## 116 TRUE
## 64 TRUE
## 105 TRUE
## 86 TRUE
## 69 TRUE
## 21 NA
## 28 NA
## 39 NA
## 40 NA
## 41 NA
# Remove every feature flagged myNearZV in glb_feats_df from the observations.
# drop=FALSE keeps a data frame even if only one column survives.
glb_allobs_df <- glb_allobs_df[, setdiff(names(glb_allobs_df),
                                         subset(glb_feats_df, myNearZV)$id),
                               drop=FALSE]

# Tag interaction-only features: glb_interaction_only_features holds feature ids as
# values and the label to record as names. Rows are located with match() so that each
# label lands on its own feature regardless of the row order of glb_feats_df, and ids
# that are absent from glb_feats_df are skipped instead of causing a length mismatch.
if (!is.null(glb_interaction_only_features)) {
    if (!("interaction.feat" %in% names(glb_feats_df)))
        glb_feats_df$interaction.feat <- NA
    .intr_rows <- match(glb_interaction_only_features, glb_feats_df$id)
    .intr_found <- !is.na(.intr_rows)
    if (any(.intr_found))
        glb_feats_df[.intr_rows[.intr_found], "interaction.feat"] <-
            names(glb_interaction_only_features)[.intr_found]
    rm(.intr_rows, .intr_found)     # temporaries only; keep the workspace clean
} else {
    glb_feats_df$interaction.feat <- NA
}

# Report missing / zero / Inf / NaN counts per column; terminate=TRUE is passed to the
# helper (NOTE(review): presumably aborts the run on problem data -- confirm).
mycheck_problem_data(glb_allobs_df, terminate = TRUE)
## [1] "numeric data missing in : "
## dirty dirty.fctr
## 200 200
## [1] "numeric data w/ 0s in : "
## dirty R.T.food R.T.friend R.T.get R.T.good
## 273 107 222 152 92
## R.T.great R.T.just R.T.like R.T.one R.T.order
## 199 160 132 173 178
## R.T.place R.T.realli R.T.time R.T.alway R.T.bar
## 93 204 172 292 464
## R.T.bbq R.T.beer R.T.best R.T.breakfast R.T.burger
## 648 568 272 641 636
## R.T.cake R.T.can R.T.cheap R.T.chees R.T.chicken
## 630 225 469 513 343
## R.T.chines R.T.coffe R.T.delici R.T.dim R.T.dish
## 632 617 334 674 373
## R.T.drink R.T.egg R.T.fresh R.T.fri R.T.happi
## 418 544 357 420 441
## R.T.hour R.T.ive R.T.love R.T.lunch R.T.menu
## 433 247 252 402 318
## R.T.nice R.T.night R.T.noodl R.T.park R.T.pho
## 274 430 574 516 662
## R.T.pizza R.T.pork R.T.pretti R.T.price R.T.restaur
## 655 520 265 261 308
## R.T.rice R.T.roll R.T.salad R.T.sandwich R.T.sauc
## 472 518 466 542 366
## R.T.seattl R.T.servic R.T.soup R.T.tabl R.T.taco
## 320 232 492 369 677
## R.T.tast R.T.thai R.T.tofu R.T.tri R.T.vietnames
## 333 648 595 231 661
## R.nuppr.log R.ndgts.log R.npnct01.log R.npnct02.log R.npnct03.log
## 1 104 119 288 644
## R.npnct04.log R.npnct05.log R.npnct06.log R.npnct07.log R.npnct08.log
## 290 644 457 20 110
## R.npnct09.log R.npnct10.log R.npnct11.log R.npnct12.log R.npnct14.log
## 569 594 20 96 296
## R.npnct15.log R.npnct16.log R.npnct18.log R.npnct19.log R.npnct20.log
## 251 343 630 245 714
## R.npnct21.log R.npnct23.log R.npnct30.log R.P.http R.P.year.colon
## 719 718 688 692 739
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## review .rownames review.my
## 0 0 0
# glb_allobs_df %>% filter(is.na(Married.fctr)) %>% tbl_df()
# glb_allobs_df %>% count(Married.fctr)
# levels(glb_allobs_df$Married.fctr)
# Record the start of the "partition.data.training" step in the chunk log.
# major.inc=TRUE is passed to the helper; the log printed after this call shows
# step_major going from 5 to 6.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "partition.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 8 select.features 5 0 130.484 131.703 1.219
## 9 partition.data.training 6 0 131.703 NA NA
6.0: partition data training
# Build the Fit and OOB observation sets.
# * New observations carry no response: hold out part of the training observations as
#   OOB with caTools::sample.split on the raw response.
# * New observations already carry a response: fit on all training observations and
#   use the new observations as OOB.
if (all(is.na(glb_newobs_df[, glb_rsp_var]))) {
    # require() only returns FALSE when the package is missing; check it so the failure
    # is reported here rather than as "could not find function" further down.
    if (!require(caTools))
        stop("caTools package is required for sample.split()")
    # The held-out share is 1.1 * nrow(new) / nrow(training); when that reaches 1 the
    # SplitRatio below would be zero or negative and no usable Fit set can be drawn.
    if (nrow(glb_newobs_df) * 1.1 >= nrow(glb_trnobs_df))
        stop("glb_trnobs_df has too few rows (", nrow(glb_trnobs_df),
             ") to hold out an OOB set sized from glb_newobs_df (",
             nrow(glb_newobs_df), " rows)")
    set.seed(glb_split_sample.seed)
    split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
        SplitRatio=1 - (nrow(glb_newobs_df) * 1.1 / nrow(glb_trnobs_df)))
    glb_fitobs_df <- glb_trnobs_df[split, ]
    glb_OOBobs_df <- glb_trnobs_df[!split ,]
} else {
    print(sprintf("Newdata contains non-NA data for %s; setting OOB to Newdata",
                  glb_rsp_var))
    glb_fitobs_df <- glb_trnobs_df; glb_OOBobs_df <- glb_newobs_df
}
## Loading required package: caTools
# Optionally cap the size of the Fit set: when glb_max_fitobs is set and smaller than
# the current Fit set, re-sample it with sample.split on the raw response.
# NOTE(review): glb_max_fitobs is passed as SplitRatio; caTools documents values > 1
# as a row count rather than a fraction -- confirm that is the intended use.
if (!is.null(glb_max_fitobs) && (nrow(glb_fitobs_df) > glb_max_fitobs)) {
    warning("glb_fitobs_df restricted to glb_max_fitobs: ",
            format(glb_max_fitobs, big.mark=","))
    org_fitobs_df <- glb_fitobs_df
    # `split` stays in the workspace, exactly as with the in-subscript assignment
    split <- sample.split(org_fitobs_df[, glb_rsp_var_raw],
                          SplitRatio=glb_max_fitobs)
    glb_fitobs_df <- org_fitobs_df[split, ]
    org_fitobs_df <- NULL
}
# Label each observation with the partition it belongs to: "" by default, "Fit" when
# its id appears in glb_fitobs_df, "OOB" when it appears in glb_OOBobs_df.
# OOB is assigned last, so it wins for an id present in both sets.
glb_allobs_df[[".lcn"]] <- ""
glb_allobs_df[[".lcn"]][glb_allobs_df[, glb_id_var] %in%
                        glb_fitobs_df[, glb_id_var]] <- "Fit"
glb_allobs_df[[".lcn"]][glb_allobs_df[, glb_id_var] %in%
                        glb_OOBobs_df[, glb_id_var]] <- "OOB"
# Print the distribution of `partition_var` within each level of `location_var`.
#
# obs_df:        observations to tabulate
# location_var:  name of the column whose values become the row names
# partition_var: name of the column being cross-tabulated
#
# Prints the cross-tab from mycreate_xtab_df() (minus the location column) and then the
# same table as row-wise proportions; the proportions table is the (invisible) value
# of the final print().
dsp_class_dstrb <- function(obs_df, location_var, partition_var) {
    xtab_df <- mycreate_xtab_df(obs_df, c(location_var, partition_var))
    rownames(xtab_df) <- xtab_df[, location_var]
    # Drop the location column by exact name. The previous `-grepl(...)` index turned
    # the logical mask into 0 / -1, which always removed column 1 whatever matched, and
    # treated the name as a regular expression ("." matching any character).
    # drop=FALSE keeps a data frame when a single count column remains.
    xtab_df <- xtab_df[, names(xtab_df) != location_var, drop=FALSE]
    print(xtab_df)
    print(xtab_df / rowSums(xtab_df, na.rm=TRUE))
}
# When category variables are configured, compare how they are distributed in the
# "Test" source rows versus the OOB rows (and, for classification, print the response
# distribution per partition first).
if (!is.null(glb_category_vars)) {
    if (glb_is_classification) {
        dsp_class_dstrb(glb_allobs_df, ".lcn", glb_rsp_var_raw)
    }

    # Per-category counts for each of the two row sets
    newobs_ctgry_df <- mycreate_sqlxtab_df(
        subset(glb_allobs_df, .src == "Test"), glb_category_vars)
    OOBobs_ctgry_df <- mycreate_sqlxtab_df(
        subset(glb_allobs_df, .lcn == "OOB"), glb_category_vars)

    # Full outer join so categories present in only one set still show up
    glb_ctgry_df <- merge(newobs_ctgry_df, OOBobs_ctgry_df,
                          by=glb_category_vars, all=TRUE,
                          suffixes=c(".Tst", ".OOB"))

    # Share of each category within its own set
    glb_ctgry_df[[".freqRatio.Tst"]] <-
        glb_ctgry_df[[".n.Tst"]] / sum(glb_ctgry_df[[".n.Tst"]], na.rm=TRUE)
    glb_ctgry_df[[".freqRatio.OOB"]] <-
        glb_ctgry_df[[".n.OOB"]] / sum(glb_ctgry_df[[".n.OOB"]], na.rm=TRUE)

    print(orderBy(~-.freqRatio.Tst-.freqRatio.OOB, glb_ctgry_df))
}
# Run this line by line
# Show the size (rows x columns) of the feature table before flag columns are added.
print("glb_feats_df:"); print(dim(glb_feats_df))
## [1] "glb_feats_df:"
## [1] 122 12
# Keep a copy of the feature table so this section can be re-run from a known state
# (the second assignment restores from that copy).
sav_feats_df <- glb_feats_df
glb_feats_df <- sav_feats_df

# Flag the raw response variable
glb_feats_df$rsp_var_raw <- FALSE
glb_feats_df$rsp_var_raw[glb_feats_df$id == glb_rsp_var_raw] <- TRUE

# Convert the exclude flag to logical (TRUE where it equals 1)
glb_feats_df[["exclude.as.feat"]] <- (glb_feats_df[["exclude.as.feat"]] == 1)

# Flag the id variable, unless observations are identified by row names only
if (!is.null(glb_id_var) && glb_id_var != ".rownames") {
    glb_feats_df[glb_feats_df$id %in% glb_id_var, "id_var"] <- TRUE
}

# One-row table for the derived response variable: never a feature, flagged rsp_var
add_feats_df <- data.frame(id=glb_rsp_var, exclude.as.feat=TRUE, rsp_var=TRUE)
row.names(add_feats_df) <- add_feats_df$id
print(add_feats_df)
## id exclude.as.feat rsp_var
## dirty.fctr dirty.fctr TRUE TRUE
# Append the derived-response row to the feature table, then show the special rows
# (raw response, derived response and, when one is configured, the id variable).
glb_feats_df <- myrbind_df(glb_feats_df, add_feats_df)
# The id_var column is only created when glb_id_var is non-NULL and not ".rownames";
# use the same guard here so a NULL glb_id_var does not make `if` fail with
# "argument is of length zero".
if (!is.null(glb_id_var) && glb_id_var != ".rownames") {
    print(subset(glb_feats_df, rsp_var_raw | rsp_var | id_var))
} else {
    print(subset(glb_feats_df, rsp_var_raw | rsp_var))
}
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## 2 dirty 1 TRUE 1 <NA> 1
## dirty.fctr dirty.fctr NA TRUE NA <NA> NA
## percentUnique zeroVar nzv myNearZV is.cor.y.abs.low
## 2 0.3663004 FALSE FALSE FALSE FALSE
## dirty.fctr NA NA NA NA NA
## interaction.feat rsp_var_raw rsp_var
## 2 NA TRUE NA
## dirty.fctr NA NA TRUE
# Consistency check 1: feature ids listed in glb_feats_df that are not columns of
# glb_allobs_df.
print("glb_feats_df vs. glb_allobs_df: ");
## [1] "glb_feats_df vs. glb_allobs_df: "
print(setdiff(glb_feats_df$id, names(glb_allobs_df)))
## [1] "R.T.banh" "R.T.sum"
## [3] "R.npnct25.log" "R.T.falafel"
## [5] "R.npnct28.log" "R.T.deli"
## [7] "R.npnct27.log" "R.npnct29.log"
## [9] "R.npnct22.log" "R.npnct26.log"
## [11] "R.T.philli" "R.T.sushi"
## [13] "R.T.teriyaki" "R.T.deliveri"
## [15] "R.T.sandwhich" "R.T.mexican"
## [17] "R.T.ethiopian" "R.npnct17.log"
## [19] "R.npnct24.log" "R.P.daily.clip.report"
## [21] "R.P.fashion.week" "R.P.first.draft"
# Consistency check 2: columns of glb_allobs_df that have no entry in glb_feats_df,
# after removing the columns reported by myfind_chr_cols_df(); an empty result means
# every remaining column is accounted for.
print("glb_allobs_df vs. glb_feats_df: ");
## [1] "glb_allobs_df vs. glb_feats_df: "
# Ensure these are only chr vars
print(setdiff(setdiff(names(glb_allobs_df), glb_feats_df$id),
myfind_chr_cols_df(glb_allobs_df)))
## character(0)
#print(setdiff(setdiff(names(glb_allobs_df), glb_exclude_vars_as_features),
# glb_feats_df$id))
# Dimensions (rows x columns) of each observation set.
# NOTE(review): the recorded output shows 106 columns for glb_allobs_df but 127 for
# the trn/fit/OOB/new sets -- the near-zero-variance columns were removed from
# glb_allobs_df only; confirm the other sets are meant to keep them.
print("glb_allobs_df: "); print(dim(glb_allobs_df))
## [1] "glb_allobs_df: "
## [1] 746 106
print("glb_trnobs_df: "); print(dim(glb_trnobs_df))
## [1] "glb_trnobs_df: "
## [1] 546 127
print("glb_fitobs_df: "); print(dim(glb_fitobs_df))
## [1] "glb_fitobs_df: "
## [1] 326 127
print("glb_OOBobs_df: "); print(dim(glb_OOBobs_df))
## [1] "glb_OOBobs_df: "
## [1] 220 127
print("glb_newobs_df: "); print(dim(glb_newobs_df))
## [1] "glb_newobs_df: "
## [1] 200 127
# # Does not handle NULL or length(glb_id_var) > 1
# glb_allobs_df$.src.trn <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_trnobs_df[, glb_id_var],
# ".src.trn"] <- 1
# glb_allobs_df$.src.fit <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_fitobs_df[, glb_id_var],
# ".src.fit"] <- 1
# glb_allobs_df$.src.OOB <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_OOBobs_df[, glb_id_var],
# ".src.OOB"] <- 1
# glb_allobs_df$.src.new <- 0
# glb_allobs_df[glb_allobs_df[, glb_id_var] %in% glb_newobs_df[, glb_id_var],
# ".src.new"] <- 1
# #print(unique(glb_allobs_df[, ".src.trn"]))
# write_cols <- c(glb_feats_df$id,
# ".src.trn", ".src.fit", ".src.OOB", ".src.new")
# glb_allobs_df <- glb_allobs_df[, write_cols]
#
# tmp_feats_df <- glb_feats_df
# tmp_entity_df <- glb_allobs_df
# Optionally persist the feature table and the full observation table to
# <glb_out_pfx>blddfs_dsk.RData.
if (glb_save_envir) {
    save(glb_feats_df,
         glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
         file=paste0(glb_out_pfx, "blddfs_dsk.RData"))
}
# load(paste0(glb_out_pfx, "blddfs_dsk.RData"))
# if (!all.equal(tmp_feats_df, glb_feats_df))
# stop("glb_feats_df r/w not working")
# if (!all.equal(tmp_entity_df, glb_allobs_df))
# stop("glb_allobs_df r/w not working")
# `split` is only created when one of the sample.split() calls ran; check first so
# that rm() does not warn "object 'split' not found" when new data supplied the OOB
# set. inherits=FALSE keeps base::split() from satisfying the check.
if (exists("split", inherits=FALSE)) rm(split)
# Close the partitioning step and open the "fit.models" step in the chunk log.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 9 partition.data.training 6 0 131.703 132.228 0.525
## 10 fit.models 7 0 132.229 NA NA
7.0: fit models
# load(paste0(glb_out_pfx, "dsk.RData"))
# keep_cols <- setdiff(names(glb_allobs_df),
# grep("^.src", names(glb_allobs_df), value=TRUE))
# glb_trnobs_df <- glb_allobs_df[glb_allobs_df$.src.trn == 1, keep_cols]
# glb_fitobs_df <- glb_allobs_df[glb_allobs_df$.src.fit == 1, keep_cols]
# glb_OOBobs_df <- glb_allobs_df[glb_allobs_df$.src.OOB == 1, keep_cols]
# glb_newobs_df <- glb_allobs_df[glb_allobs_df$.src.new == 1, keep_cols]
#
# glb_models_lst <- list(); glb_models_df <- data.frame()
#
# A binomial classifier cannot be fitted when the Fit set holds fewer than two
# distinct response values, so stop early with the values that are present.
if (glb_is_classification && glb_is_binomial) {
    if (length(unique(glb_fitobs_df[, glb_rsp_var])) < 2) {
        stop("glb_fitobs_df$", glb_rsp_var, ": contains less than 2 unique values: ",
             paste0(unique(glb_fitobs_df[, glb_rsp_var]), collapse=", "))
    }
}

# Ids of the two features with the largest |correlation with the response| among those
# that are not excluded, not flagged as low-correlation and not marked as highly
# correlated with another feature. Indexing rows 1:2 yields NA entries when fewer than
# two features qualify.
max_cor_y_x_vars <- orderBy(
    ~ -cor.y.abs,
    subset(glb_feats_df,
           (exclude.as.feat == 0) & !is.cor.y.abs.low & is.na(cor.high.X))
)[1:2, "id"]
# while(length(max_cor_y_x_vars) < 2) {
# max_cor_y_x_vars <- c(max_cor_y_x_vars, orderBy(~ -cor.y.abs,
# subset(glb_feats_df, (exclude.as.feat == 0) & !is.cor.y.abs.low))[3, "id"])
# }
# Sanity check for a configured Baseline variable: the top feature selected in
# max_cor_y_x_vars must not be less correlated with the response than the Baseline
# variable.
#  * cor.y.abs is looked up through the id column: the row names of glb_feats_df are
#    not feature ids, so indexing rows by id returned NA and made `if` fail.
#  * the comparison now matches the error message (top feature LOWER than Baseline).
#  * `&&` is used because the condition is a scalar test.
# local() keeps the temporaries out of the workspace.
if (!is.null(glb_Baseline_mdl_var)) {
    local({
        top_var <- max_cor_y_x_vars[1]
        top_cor <- glb_feats_df[glb_feats_df$id %in% top_var, "cor.y.abs"]
        bsl_cor <- glb_feats_df[glb_feats_df$id %in% glb_Baseline_mdl_var,
                                "cor.y.abs"]
        if ((length(bsl_cor) != 1) || is.na(bsl_cor))
            stop("Baseline var: ", glb_Baseline_mdl_var,
                 " has no usable cor.y.abs entry in glb_feats_df")
        if ((length(top_cor) == 1) && !is.na(top_cor) &&
            (top_var != glb_Baseline_mdl_var) && (top_cor < bsl_cor))
            stop(top_var, " has a lower correlation with ", glb_rsp_var,
                 " than the Baseline var: ", glb_Baseline_mdl_var)
    })
}
# Label of the modelling task ("regression" or "classification"), chosen from the
# glb_is_regression flag.
glb_model_type <- ifelse(glb_is_regression, "regression", "classification")
# Baseline
# Fitted only when a Baseline variable is configured, using that single variable as
# the independent variable.
# NOTE(review): this call goes to myfit_mdl_fn() and passes no model_type, whereas the
# MFO fit that follows calls myfit_mdl(..., model_type=glb_model_type) -- confirm that
# myfit_mdl_fn still exists in the sourced helpers.
if (!is.null(glb_Baseline_mdl_var))
ret_lst <- myfit_mdl_fn(model_id="Baseline", model_method="mybaseln_classfr",
indep_vars_vctr=glb_Baseline_mdl_var,
rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
# "MFO" (Most Frequent Outcome) reference model; for regression this is lm on the
# random-noise feature, i.e. effectively mean(y).
# caret's nullModel is not used because its model stats are not available.
# rpart cannot be used for multinomial classification since it predicts non-MFO.
ret_lst <- myfit_mdl(
    model_id="MFO",
    model_method=if (glb_is_regression) "lm" else "myMFO_classfr",
    model_type=glb_model_type,
    indep_vars_vctr=".rnorm",
    rsp_var=glb_rsp_var,
    rsp_var_out=glb_rsp_var_out,
    fit_df=glb_fitobs_df,
    OOB_df=glb_OOBobs_df
)
## [1] "fitting model: MFO.myMFO_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.5 0.5
## [1] "MFO.val:"
## [1] "N"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.5 0.5
## 2 0.5 0.5
## 3 0.5 0.5
## 4 0.5 0.5
## 5 0.5 0.5
## 6 0.5 0.5
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.MFO.myMFO_classfr.Y
## 1 N 163
## 2 Y 163
## Prediction
## Reference N Y
## N 0 163
## Y 0 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.444047e-01 5.555953e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.220784e-01 6.820624e-37
## [1] " calling mypredict_mdl for OOB:"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.5 0.5
## 2 0.5 0.5
## 3 0.5 0.5
## 4 0.5 0.5
## 5 0.5 0.5
## 6 0.5 0.5
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.MFO.myMFO_classfr.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method feats max.nTuningRuns
## 1 MFO.myMFO_classfr myMFO_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.462 0.002 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.6666667 0.5
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.4444047 0.5555953 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4320488 0.5679512 0
# "Random" reference model - classification only.
# Regression needs none because it would be the same as the MFO model.
if (glb_is_classification) {
    ret_lst <- myfit_mdl(
        model_id="Random",
        model_method="myrandom_classfr",
        model_type=glb_model_type,
        indep_vars_vctr=".rnorm",
        rsp_var=glb_rsp_var,
        rsp_var_out=glb_rsp_var_out,
        fit_df=glb_fitobs_df,
        OOB_df=glb_OOBobs_df
    )
}
## [1] "fitting model: Random.myrandom_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## [1] "in Random.Classifier$prob"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Random.myrandom_classfr.Y
## 1 N 163
## 2 Y 163
## Prediction
## Reference N Y
## N 0 163
## Y 0 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.444047e-01 5.555953e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.220784e-01 6.820624e-37
## [1] " calling mypredict_mdl for OOB:"
## [1] "in Random.Classifier$prob"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Random.myrandom_classfr.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method feats max.nTuningRuns
## 1 Random.myrandom_classfr myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.255 0.001 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.6666667 0.5
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.4444047 0.5555953 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4320488 0.5679512 0
# Models that have tuning parameters get "better" results with
# cross-validation (rf being the exception) and "different" results for
# different outcome metrics.
#
# Max.cor.Y: check the impact of cv.
# rpart is not a good candidate since caret does not optimize cp (the only
# tuning parameter of rpart) well.
ret_lst <- myfit_mdl(
    model_id="Max.cor.Y.cv.0",
    model_method="rpart",
    model_type=glb_model_type,
    indep_vars_vctr=max_cor_y_x_vars,
    rsp_var=glb_rsp_var,
    rsp_var_out=glb_rsp_var_out,
    fit_df=glb_fitobs_df,
    OOB_df=glb_OOBobs_df
)
## [1] "fitting model: Max.cor.Y.cv.0.rpart"
## [1] " indep_vars: R.npnct19.log, R.T.dish"
## Loading required package: rpart
## Fitting cp = 0.209 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 326
##
## CP nsplit rel error
## 1 0.208589 0 1
##
## Node number 1: 326 observations
## predicted class=N expected loss=0.5 P(node) =1
## class counts: 163 163
## probabilities: 0.500 0.500
##
## n= 326
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 326 163 N (0.5000000 0.5000000) *
## [1] " calling mypredict_mdl for fit:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.cv.0.rpart.Y
## 1 N 163
## 2 Y 163
## Prediction
## Reference N Y
## N 0 163
## Y 0 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.444047e-01 5.555953e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.220784e-01 6.820624e-37
## [1] " calling mypredict_mdl for OOB:"
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.cv.0.rpart.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method feats
## 1 Max.cor.Y.cv.0.rpart rpart R.npnct19.log, R.T.dish
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.617 0.014
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.5 0.6666667 0.5
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.4444047 0.5555953 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.5 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4320488 0.5679512 0
# Same rpart model on the Max.cor.Y features, this time with n_cv_folds=0 and
# the cp tuning range pinned to a single value (min == max == 0).
ret_lst <- myfit_mdl(
    model_id="Max.cor.Y.cv.0.cp.0",
    model_method="rpart",
    model_type=glb_model_type,
    indep_vars_vctr=max_cor_y_x_vars,
    rsp_var=glb_rsp_var,
    rsp_var_out=glb_rsp_var_out,
    fit_df=glb_fitobs_df,
    OOB_df=glb_OOBobs_df,
    n_cv_folds=0,
    tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1)
)
## [1] "fitting model: Max.cor.Y.cv.0.cp.0.rpart"
## [1] " indep_vars: R.npnct19.log, R.T.dish"
## Fitting cp = 0 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 326
##
## CP nsplit rel error
## 1 0.208588957 0 1.0000000
## 2 0.014314928 1 0.7914110
## 3 0.012269939 8 0.6809816
## 4 0.003067485 9 0.6687117
## 5 0.002044990 11 0.6625767
## 6 0.000000000 14 0.6564417
##
## Variable importance
## R.npnct19.log R.T.dish
## 67 33
##
## Node number 1: 326 observations, complexity param=0.208589
## predicted class=N expected loss=0.5 P(node) =1
## class counts: 163 163
## probabilities: 0.500 0.500
## left son=2 (108 obs) right son=3 (218 obs)
## Primary splits:
## R.npnct19.log < 0.3465736 to the left, improve=8.003228, (0 missing)
## R.T.dish < 0.0004890173 to the left, improve=4.809997, (0 missing)
##
## Node number 2: 108 observations, complexity param=0.003067485
## predicted class=N expected loss=0.3425926 P(node) =0.3312883
## class counts: 71 37
## probabilities: 0.657 0.343
## left son=4 (85 obs) right son=5 (23 obs)
## Primary splits:
## R.T.dish < 0.00426794 to the left, improve=0.4967415, (0 missing)
##
## Node number 3: 218 observations, complexity param=0.01431493
## predicted class=Y expected loss=0.4220183 P(node) =0.6687117
## class counts: 92 126
## probabilities: 0.422 0.578
## left son=6 (153 obs) right son=7 (65 obs)
## Primary splits:
## R.T.dish < 0.003931506 to the left, improve=3.116447, (0 missing)
## R.npnct19.log < 2.441401 to the left, improve=1.861015, (0 missing)
##
## Node number 4: 85 observations
## predicted class=N expected loss=0.3176471 P(node) =0.2607362
## class counts: 58 27
## probabilities: 0.682 0.318
##
## Node number 5: 23 observations, complexity param=0.003067485
## predicted class=N expected loss=0.4347826 P(node) =0.07055215
## class counts: 13 10
## probabilities: 0.565 0.435
## left son=10 (16 obs) right son=11 (7 obs)
## Primary splits:
## R.T.dish < 0.0065798 to the right, improve=0.3757764, (0 missing)
##
## Node number 6: 153 observations, complexity param=0.01431493
## predicted class=Y expected loss=0.4771242 P(node) =0.4693252
## class counts: 73 80
## probabilities: 0.477 0.523
## left son=12 (146 obs) right son=13 (7 obs)
## Primary splits:
## R.npnct19.log < 3.067782 to the left, improve=3.339869, (0 missing)
## R.T.dish < 0.00316708 to the right, improve=0.543166, (0 missing)
##
## Node number 7: 65 observations, complexity param=0.00204499
## predicted class=Y expected loss=0.2923077 P(node) =0.1993865
## class counts: 19 46
## probabilities: 0.292 0.708
## left son=14 (58 obs) right son=15 (7 obs)
## Primary splits:
## R.T.dish < 0.004392989 to the right, improve=1.3405840, (0 missing)
## R.npnct19.log < 1.868835 to the right, improve=0.1923077, (0 missing)
##
## Node number 10: 16 observations
## predicted class=N expected loss=0.375 P(node) =0.04907975
## class counts: 10 6
## probabilities: 0.625 0.375
##
## Node number 11: 7 observations
## predicted class=Y expected loss=0.4285714 P(node) =0.02147239
## class counts: 3 4
## probabilities: 0.429 0.571
##
## Node number 12: 146 observations, complexity param=0.01431493
## predicted class=N expected loss=0.5 P(node) =0.4478528
## class counts: 73 73
## probabilities: 0.500 0.500
## left son=24 (123 obs) right son=25 (23 obs)
## Primary splits:
## R.npnct19.log < 2.441401 to the left, improve=1.2644040, (0 missing)
## R.T.dish < 0.00316708 to the right, improve=0.7263682, (0 missing)
##
## Node number 13: 7 observations
## predicted class=Y expected loss=0 P(node) =0.02147239
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 14: 58 observations, complexity param=0.00204499
## predicted class=Y expected loss=0.3275862 P(node) =0.1779141
## class counts: 19 39
## probabilities: 0.328 0.672
## left son=28 (48 obs) right son=29 (10 obs)
## Primary splits:
## R.T.dish < 0.01122521 to the left, improve=0.3933908, (0 missing)
## R.npnct19.log < 2.249905 to the right, improve=0.2401299, (0 missing)
##
## Node number 15: 7 observations
## predicted class=Y expected loss=0 P(node) =0.02147239
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 24: 123 observations, complexity param=0.01431493
## predicted class=N expected loss=0.4715447 P(node) =0.3773006
## class counts: 65 58
## probabilities: 0.528 0.472
## left son=48 (8 obs) right son=49 (115 obs)
## Primary splits:
## R.npnct19.log < 2.138333 to the right, improve=0.8399434, (0 missing)
## R.T.dish < 0.0007645621 to the right, improve=0.4741463, (0 missing)
##
## Node number 25: 23 observations, complexity param=0.01226994
## predicted class=Y expected loss=0.3478261 P(node) =0.07055215
## class counts: 8 15
## probabilities: 0.348 0.652
## left son=50 (10 obs) right son=51 (13 obs)
## Primary splits:
## R.npnct19.log < 2.740319 to the right, improve=2.250167, (0 missing)
## R.T.dish < 0.0004695535 to the left, improve=1.276052, (0 missing)
## Surrogate splits:
## R.T.dish < 0.0004837056 to the left, agree=0.652, adj=0.2, (0 split)
##
## Node number 28: 48 observations, complexity param=0.00204499
## predicted class=Y expected loss=0.3541667 P(node) =0.1472393
## class counts: 17 31
## probabilities: 0.354 0.646
## left son=56 (9 obs) right son=57 (39 obs)
## Primary splits:
## R.npnct19.log < 0.8958797 to the left, improve=0.8985043, (0 missing)
## R.T.dish < 0.007454815 to the right, improve=0.8406863, (0 missing)
##
## Node number 29: 10 observations
## predicted class=Y expected loss=0.2 P(node) =0.03067485
## class counts: 2 8
## probabilities: 0.200 0.800
##
## Node number 48: 8 observations
## predicted class=N expected loss=0.25 P(node) =0.02453988
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 49: 115 observations, complexity param=0.01431493
## predicted class=N expected loss=0.4869565 P(node) =0.3527607
## class counts: 59 56
## probabilities: 0.513 0.487
## left son=98 (36 obs) right son=99 (79 obs)
## Primary splits:
## R.npnct19.log < 0.8958797 to the left, improve=0.5178316, (0 missing)
## R.T.dish < 0.0007645621 to the right, improve=0.4333408, (0 missing)
##
## Node number 50: 10 observations
## predicted class=N expected loss=0.4 P(node) =0.03067485
## class counts: 6 4
## probabilities: 0.600 0.400
##
## Node number 51: 13 observations
## predicted class=Y expected loss=0.1538462 P(node) =0.0398773
## class counts: 2 11
## probabilities: 0.154 0.846
##
## Node number 56: 9 observations
## predicted class=N expected loss=0.4444444 P(node) =0.02760736
## class counts: 5 4
## probabilities: 0.556 0.444
##
## Node number 57: 39 observations
## predicted class=Y expected loss=0.3076923 P(node) =0.1196319
## class counts: 12 27
## probabilities: 0.308 0.692
##
## Node number 98: 36 observations
## predicted class=N expected loss=0.4166667 P(node) =0.1104294
## class counts: 21 15
## probabilities: 0.583 0.417
##
## Node number 99: 79 observations, complexity param=0.01431493
## predicted class=Y expected loss=0.4810127 P(node) =0.2423313
## class counts: 38 41
## probabilities: 0.481 0.519
## left son=198 (38 obs) right son=199 (41 obs)
## Primary splits:
## R.T.dish < 0.0007645621 to the right, improve=0.7511253, (0 missing)
## R.npnct19.log < 1.242453 to the left, improve=0.1154026, (0 missing)
## Surrogate splits:
## R.npnct19.log < 1.497866 to the right, agree=0.532, adj=0.026, (0 split)
##
## Node number 198: 38 observations, complexity param=0.01431493
## predicted class=N expected loss=0.4473684 P(node) =0.1165644
## class counts: 21 17
## probabilities: 0.553 0.447
## left son=396 (24 obs) right son=397 (14 obs)
## Primary splits:
## R.T.dish < 0.00284298 to the left, improve=1.6942360, (0 missing)
## R.npnct19.log < 1.242453 to the left, improve=0.7710121, (0 missing)
##
## Node number 199: 41 observations
## predicted class=Y expected loss=0.4146341 P(node) =0.1257669
## class counts: 17 24
## probabilities: 0.415 0.585
##
## Node number 396: 24 observations
## predicted class=N expected loss=0.3333333 P(node) =0.07361963
## class counts: 16 8
## probabilities: 0.667 0.333
##
## Node number 397: 14 observations
## predicted class=Y expected loss=0.3571429 P(node) =0.04294479
## class counts: 5 9
## probabilities: 0.357 0.643
##
## n= 326
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 326 163 N (0.5000000 0.5000000)
## 2) R.npnct19.log< 0.3465736 108 37 N (0.6574074 0.3425926)
## 4) R.T.dish< 0.00426794 85 27 N (0.6823529 0.3176471) *
## 5) R.T.dish>=0.00426794 23 10 N (0.5652174 0.4347826)
## 10) R.T.dish>=0.0065798 16 6 N (0.6250000 0.3750000) *
## 11) R.T.dish< 0.0065798 7 3 Y (0.4285714 0.5714286) *
## 3) R.npnct19.log>=0.3465736 218 92 Y (0.4220183 0.5779817)
## 6) R.T.dish< 0.003931506 153 73 Y (0.4771242 0.5228758)
## 12) R.npnct19.log< 3.067782 146 73 N (0.5000000 0.5000000)
## 24) R.npnct19.log< 2.441401 123 58 N (0.5284553 0.4715447)
## 48) R.npnct19.log>=2.138333 8 2 N (0.7500000 0.2500000) *
## 49) R.npnct19.log< 2.138333 115 56 N (0.5130435 0.4869565)
## 98) R.npnct19.log< 0.8958797 36 15 N (0.5833333 0.4166667) *
## 99) R.npnct19.log>=0.8958797 79 38 Y (0.4810127 0.5189873)
## 198) R.T.dish>=0.0007645621 38 17 N (0.5526316 0.4473684)
## 396) R.T.dish< 0.00284298 24 8 N (0.6666667 0.3333333) *
## 397) R.T.dish>=0.00284298 14 5 Y (0.3571429 0.6428571) *
## 199) R.T.dish< 0.0007645621 41 17 Y (0.4146341 0.5853659) *
## 25) R.npnct19.log>=2.441401 23 8 Y (0.3478261 0.6521739)
## 50) R.npnct19.log>=2.740319 10 4 N (0.6000000 0.4000000) *
## 51) R.npnct19.log< 2.740319 13 2 Y (0.1538462 0.8461538) *
## 13) R.npnct19.log>=3.067782 7 0 Y (0.0000000 1.0000000) *
## 7) R.T.dish>=0.003931506 65 19 Y (0.2923077 0.7076923)
## 14) R.T.dish>=0.004392989 58 19 Y (0.3275862 0.6724138)
## 28) R.T.dish< 0.01122521 48 17 Y (0.3541667 0.6458333)
## 56) R.npnct19.log< 0.8958797 9 4 N (0.5555556 0.4444444) *
## 57) R.npnct19.log>=0.8958797 39 12 Y (0.3076923 0.6923077) *
## 29) R.T.dish>=0.01122521 10 2 Y (0.2000000 0.8000000) *
## 15) R.T.dish< 0.004392989 7 0 Y (0.0000000 1.0000000) *
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6666667
## 3 0.2 0.6666667
## 4 0.3 0.6694387
## 5 0.4 0.6705202
## 6 0.5 0.6445183
## 7 0.6 0.5454545
## 8 0.7 0.3300000
## 9 0.8 0.3300000
## 10 0.9 0.1581921
## 11 1.0 0.1581921
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.N
## 1 N 96
## 2 Y 47
## dirty.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.Y
## 1 67
## 2 116
## Prediction
## Reference N Y
## N 96 67
## Y 47 116
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.503067e-01 3.006135e-01 5.958226e-01 7.020312e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 3.118365e-08 7.515569e-02
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66666667
## 4 0.3 0.65230769
## 5 0.4 0.61603376
## 6 0.5 0.58883249
## 7 0.6 0.53571429
## 8 0.7 0.26470588
## 9 0.8 0.26470588
## 10 0.9 0.08196721
## 11 1.0 0.08196721
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.cv.0.cp.0.rpart.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method feats
## 1 Max.cor.Y.cv.0.cp.0.rpart rpart R.npnct19.log, R.T.dish
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.47 0.018
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.7135383 0.4 0.6705202 0.6503067
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.5958226 0.7020312 0.3006135 0.6429752
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4320488 0.5679512 0
# rpart on the Max.cor.Y features using glb_n_cv_folds folds and no custom
# tuning grid.
# Skipped for multinomials: for those this model will be run next by default.
if (glb_is_regression || glb_is_binomial) {
    ret_lst <- myfit_mdl(
        model_id="Max.cor.Y",
        model_method="rpart",
        model_type=glb_model_type,
        indep_vars_vctr=max_cor_y_x_vars,
        rsp_var=glb_rsp_var,
        rsp_var_out=glb_rsp_var_out,
        fit_df=glb_fitobs_df,
        OOB_df=glb_OOBobs_df,
        n_cv_folds=glb_n_cv_folds,
        tune_models_df=NULL
    )
}
## [1] "fitting model: Max.cor.Y.rpart"
## [1] " indep_vars: R.npnct19.log, R.T.dish"
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0123 on full training set
## Warning in myfit_mdl(model_id = "Max.cor.Y", model_method = "rpart",
## model_type = glb_model_type, : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 326
##
## CP nsplit rel error
## 1 0.20858896 0 1.0000000
## 2 0.01431493 1 0.7914110
## 3 0.01226994 8 0.6809816
##
## Variable importance
## R.npnct19.log R.T.dish
## 72 28
##
## Node number 1: 326 observations, complexity param=0.208589
## predicted class=N expected loss=0.5 P(node) =1
## class counts: 163 163
## probabilities: 0.500 0.500
## left son=2 (108 obs) right son=3 (218 obs)
## Primary splits:
## R.npnct19.log < 0.3465736 to the left, improve=8.003228, (0 missing)
## R.T.dish < 0.0004890173 to the left, improve=4.809997, (0 missing)
##
## Node number 2: 108 observations
## predicted class=N expected loss=0.3425926 P(node) =0.3312883
## class counts: 71 37
## probabilities: 0.657 0.343
##
## Node number 3: 218 observations, complexity param=0.01431493
## predicted class=Y expected loss=0.4220183 P(node) =0.6687117
## class counts: 92 126
## probabilities: 0.422 0.578
## left son=6 (153 obs) right son=7 (65 obs)
## Primary splits:
## R.T.dish < 0.003931506 to the left, improve=3.116447, (0 missing)
## R.npnct19.log < 2.441401 to the left, improve=1.861015, (0 missing)
##
## Node number 6: 153 observations, complexity param=0.01431493
## predicted class=Y expected loss=0.4771242 P(node) =0.4693252
## class counts: 73 80
## probabilities: 0.477 0.523
## left son=12 (146 obs) right son=13 (7 obs)
## Primary splits:
## R.npnct19.log < 3.067782 to the left, improve=3.339869, (0 missing)
## R.T.dish < 0.00316708 to the right, improve=0.543166, (0 missing)
##
## Node number 7: 65 observations
## predicted class=Y expected loss=0.2923077 P(node) =0.1993865
## class counts: 19 46
## probabilities: 0.292 0.708
##
## Node number 12: 146 observations, complexity param=0.01431493
## predicted class=N expected loss=0.5 P(node) =0.4478528
## class counts: 73 73
## probabilities: 0.500 0.500
## left son=24 (123 obs) right son=25 (23 obs)
## Primary splits:
## R.npnct19.log < 2.441401 to the left, improve=1.2644040, (0 missing)
## R.T.dish < 0.00316708 to the right, improve=0.7263682, (0 missing)
##
## Node number 13: 7 observations
## predicted class=Y expected loss=0 P(node) =0.02147239
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 24: 123 observations, complexity param=0.01431493
## predicted class=N expected loss=0.4715447 P(node) =0.3773006
## class counts: 65 58
## probabilities: 0.528 0.472
## left son=48 (8 obs) right son=49 (115 obs)
## Primary splits:
## R.npnct19.log < 2.138333 to the right, improve=0.8399434, (0 missing)
## R.T.dish < 0.0007645621 to the right, improve=0.4741463, (0 missing)
##
## Node number 25: 23 observations
## predicted class=Y expected loss=0.3478261 P(node) =0.07055215
## class counts: 8 15
## probabilities: 0.348 0.652
##
## Node number 48: 8 observations
## predicted class=N expected loss=0.25 P(node) =0.02453988
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 49: 115 observations, complexity param=0.01431493
## predicted class=N expected loss=0.4869565 P(node) =0.3527607
## class counts: 59 56
## probabilities: 0.513 0.487
## left son=98 (36 obs) right son=99 (79 obs)
## Primary splits:
## R.npnct19.log < 0.8958797 to the left, improve=0.5178316, (0 missing)
## R.T.dish < 0.0007645621 to the right, improve=0.4333408, (0 missing)
##
## Node number 98: 36 observations
## predicted class=N expected loss=0.4166667 P(node) =0.1104294
## class counts: 21 15
## probabilities: 0.583 0.417
##
## Node number 99: 79 observations, complexity param=0.01431493
## predicted class=Y expected loss=0.4810127 P(node) =0.2423313
## class counts: 38 41
## probabilities: 0.481 0.519
## left son=198 (38 obs) right son=199 (41 obs)
## Primary splits:
## R.T.dish < 0.0007645621 to the right, improve=0.7511253, (0 missing)
## R.npnct19.log < 1.242453 to the left, improve=0.1154026, (0 missing)
## Surrogate splits:
## R.npnct19.log < 1.497866 to the right, agree=0.532, adj=0.026, (0 split)
##
## Node number 198: 38 observations, complexity param=0.01431493
## predicted class=N expected loss=0.4473684 P(node) =0.1165644
## class counts: 21 17
## probabilities: 0.553 0.447
## left son=396 (24 obs) right son=397 (14 obs)
## Primary splits:
## R.T.dish < 0.00284298 to the left, improve=1.6942360, (0 missing)
## R.npnct19.log < 1.242453 to the left, improve=0.7710121, (0 missing)
##
## Node number 199: 41 observations
## predicted class=Y expected loss=0.4146341 P(node) =0.1257669
## class counts: 17 24
## probabilities: 0.415 0.585
##
## Node number 396: 24 observations
## predicted class=N expected loss=0.3333333 P(node) =0.07361963
## class counts: 16 8
## probabilities: 0.667 0.333
##
## Node number 397: 14 observations
## predicted class=Y expected loss=0.3571429 P(node) =0.04294479
## class counts: 5 9
## probabilities: 0.357 0.643
##
## n= 326
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 326 163 N (0.5000000 0.5000000)
## 2) R.npnct19.log< 0.3465736 108 37 N (0.6574074 0.3425926) *
## 3) R.npnct19.log>=0.3465736 218 92 Y (0.4220183 0.5779817)
## 6) R.T.dish< 0.003931506 153 73 Y (0.4771242 0.5228758)
## 12) R.npnct19.log< 3.067782 146 73 N (0.5000000 0.5000000)
## 24) R.npnct19.log< 2.441401 123 58 N (0.5284553 0.4715447)
## 48) R.npnct19.log>=2.138333 8 2 N (0.7500000 0.2500000) *
## 49) R.npnct19.log< 2.138333 115 56 N (0.5130435 0.4869565)
## 98) R.npnct19.log< 0.8958797 36 15 N (0.5833333 0.4166667) *
## 99) R.npnct19.log>=0.8958797 79 38 Y (0.4810127 0.5189873)
## 198) R.T.dish>=0.0007645621 38 17 N (0.5526316 0.4473684)
## 396) R.T.dish< 0.00284298 24 8 N (0.6666667 0.3333333) *
## 397) R.T.dish>=0.00284298 14 5 Y (0.3571429 0.6428571) *
## 199) R.T.dish< 0.0007645621 41 17 Y (0.4146341 0.5853659) *
## 25) R.npnct19.log>=2.441401 23 8 Y (0.3478261 0.6521739) *
## 13) R.npnct19.log>=3.067782 7 0 Y (0.0000000 1.0000000) *
## 7) R.T.dish>=0.003931506 65 19 Y (0.2923077 0.7076923) *
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66666667
## 4 0.3 0.66943867
## 5 0.4 0.66475645
## 6 0.5 0.64536741
## 7 0.6 0.56617647
## 8 0.7 0.45106383
## 9 0.8 0.08235294
## 10 0.9 0.08235294
## 11 1.0 0.08235294
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.rpart.N
## 1 N 6
## 2 Y 2
## dirty.fctr.predict.Max.cor.Y.rpart.Y
## 1 157
## 2 161
## Prediction
## Reference N Y
## N 6 157
## Y 2 161
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.122699e-01 2.453988e-02 4.565784e-01 5.677366e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 3.491525e-01 2.649549e-34
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66666667
## 4 0.3 0.65230769
## 5 0.4 0.61666667
## 6 0.5 0.60287081
## 7 0.6 0.55248619
## 8 0.7 0.48192771
## 9 0.8 0.03508772
## 10 0.9 0.03508772
## 11 1.0 0.03508772
## [1] "Classifier Probability Threshold: 0.2000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.rpart.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.rpart rpart R.npnct19.log, R.T.dish 3
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.972 0.014 0.6865708
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.6694387 0.567284
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.4565784 0.5677366 0.1345679 0.6256612
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4320488 0.5679512 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.03658677 0.07317354
# Max.cor.Y fitted with the default method for the problem type:
# lm for regression, glm for binomial classification, rpart otherwise.
# Used to compare vs. Interactions.High.cor.Y and/or Max.cor.Y.TmSrs
ret_lst <- myfit_mdl(
    model_id="Max.cor.Y",
    # The flags are scalars, so a plain if/else chain is used instead of the
    # vectorised ifelse().
    model_method=if (glb_is_regression) "lm" else
                 if (glb_is_binomial) "glm" else "rpart",
    model_type=glb_model_type,
    indep_vars_vctr=max_cor_y_x_vars,
    rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
    fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
    n_cv_folds=glb_n_cv_folds, tune_models_df=NULL
)
## [1] "fitting model: Max.cor.Y.glm"
## [1] " indep_vars: R.npnct19.log, R.T.dish"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7696 -1.1050 -0.1613 1.1058 1.4855
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7002 0.1871 -3.742 0.000182 ***
## R.npnct19.log 0.4857 0.1220 3.981 6.86e-05 ***
## R.T.dish 71.7182 29.8330 2.404 0.016217 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 451.93 on 325 degrees of freedom
## Residual deviance: 428.09 on 323 degrees of freedom
## AIC: 434.09
##
## Number of Fisher Scoring iterations: 4
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66666667
## 4 0.3 0.66666667
## 5 0.4 0.67326733
## 6 0.5 0.61442006
## 7 0.6 0.46530612
## 8 0.7 0.22105263
## 9 0.8 0.02424242
## 10 0.9 0.00000000
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.glm.N
## 1 N 58
## 2 Y 27
## dirty.fctr.predict.Max.cor.Y.glm.Y
## 1 105
## 2 136
## Prediction
## Reference N Y
## N 58 105
## Y 27 136
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.950920e-01 1.901840e-01 5.396076e-01 6.488323e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 3.525528e-04 2.056005e-11
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66666667
## 4 0.3 0.66666667
## 5 0.4 0.66425993
## 6 0.5 0.62443439
## 7 0.6 0.54857143
## 8 0.7 0.22047244
## 9 0.8 0.01801802
## 10 0.9 0.00000000
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Max.cor.Y.glm.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method feats max.nTuningRuns
## 1 Max.cor.Y.glm glm R.npnct19.log, R.T.dish 1
## min.elapsedtime.everything min.elapsedtime.final max.auc.fit
## 1 0.931 0.014 0.6527532
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.6732673 0.6010101
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.5396076 0.6488323 0.2020202 0.681281
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.4320488 0.5679512 0 434.0906
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.03578184 0.07156367
# Max.cor.Y.TmSrs.poly: the Max.cor.Y features plus every
# "<date var>.day.minutes.poly.*" column present in glb_allobs_df.
# Runs only when glb_date_vars is set and at least one such column exists.
#
# The per-variable patterns are collapsed with "|" into a single regex:
# grep()/grepl() use only the first element of a pattern vector, so with more
# than one date variable the columns of the others would otherwise be missed.
tmsrs_poly_feats <- if (is.null(glb_date_vars)) character(0) else
    grep(paste0(glb_date_vars, "\\.day\\.minutes\\.poly\\.", collapse="|"),
         names(glb_allobs_df), value=TRUE)
if (length(tmsrs_poly_feats) > 0) {
    # Disabled variant (kept for reference):
#     ret_lst <- myfit_mdl(model_id="Max.cor.Y.TmSrs.poly1",
#                             model_method=ifelse(glb_is_regression, "lm",
#                                         ifelse(glb_is_binomial, "glm", "rpart")),
#                          model_type=glb_model_type,
#                             indep_vars_vctr=c(max_cor_y_x_vars, paste0(glb_date_vars, ".day.minutes")),
#                             rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
#                             fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
#                             n_cv_folds=glb_n_cv_folds, tune_models_df=NULL)
#
    ret_lst <- myfit_mdl(
        model_id="Max.cor.Y.TmSrs.poly",
        # Scalar flags: plain if/else rather than the vectorised ifelse().
        model_method=if (glb_is_regression) "lm" else
                     if (glb_is_binomial) "glm" else "rpart",
        model_type=glb_model_type,
        indep_vars_vctr=c(max_cor_y_x_vars, tmsrs_poly_feats),
        rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
        fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
        n_cv_folds=glb_n_cv_folds, tune_models_df=NULL
    )
}
# Interactions.High.cor.Y
# Extends the Max.cor.Y features with the features listed in
# glb_feats_df$cor.high.X (NAs dropped); skipped when there are none.
int_feats <- setdiff(unique(glb_feats_df$cor.high.X), NA)
if (length(int_feats) > 0) {
    # lm & glm handle interaction terms; rpart & rf do not
    if (glb_is_regression || glb_is_binomial) {
        # Interact the first Max.cor.Y feature with each high-cor feature.
        # The feature itself is left out: in a model formula "x:x" is just "x",
        # so the self-interaction only added a redundant entry to the feature
        # list (it never showed up as a separate coefficient).
        int_partner_feats <- setdiff(int_feats, max_cor_y_x_vars[1])
        indep_vars_vctr <- max_cor_y_x_vars
        if (length(int_partner_feats) > 0) {
            # Guarded: paste() with an empty vector would produce "x:".
            indep_vars_vctr <- c(indep_vars_vctr,
                                 paste(max_cor_y_x_vars[1], int_partner_feats,
                                       sep=":"))
        }
    } else {
        indep_vars_vctr <- union(max_cor_y_x_vars, int_feats)
    }
    ret_lst <- myfit_mdl(
        model_id="Interact.High.cor.Y",
        # Scalar flags: plain if/else rather than the vectorised ifelse().
        model_method=if (glb_is_regression) "lm" else
                     if (glb_is_binomial) "glm" else "rpart",
        model_type=glb_model_type,
        # Named explicitly, like the other myfit_mdl calls, instead of relying
        # on positional matching among named arguments.
        indep_vars_vctr=indep_vars_vctr,
        rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
        fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
        n_cv_folds=glb_n_cv_folds, tune_models_df=NULL
    )
}
## [1] "fitting model: Interact.High.cor.Y.glm"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.npnct19.log:R.npnct19.log, R.npnct19.log:R.npnct07.log, R.npnct19.log:R.nwrds.log, R.npnct19.log:R.T.sum"
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0829 -1.0673 -0.2586 1.0945 1.8562
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7939 0.2007 -3.956 7.61e-05 ***
## R.npnct19.log 3.4572 1.2322 2.806 0.00502 **
## R.T.dish 73.7075 30.4942 2.417 0.01564 *
## `R.npnct19.log:R.npnct07.log` 0.5430 0.2690 2.019 0.04349 *
## `R.npnct19.log:R.nwrds.log` -0.6587 0.2779 -2.370 0.01780 *
## `R.npnct19.log:R.T.sum` 10.6543 8.0947 1.316 0.18810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 451.93 on 325 degrees of freedom
## Residual deviance: 417.24 on 320 degrees of freedom
## AIC: 429.24
##
## Number of Fisher Scoring iterations: 5
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66393443
## 4 0.3 0.66529774
## 5 0.4 0.67875648
## 6 0.5 0.64174455
## 7 0.6 0.50592885
## 8 0.7 0.27272727
## 9 0.8 0.11363636
## 10 0.9 0.05952381
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Interact.High.cor.Y.glm.N
## 1 N 71
## 2 Y 32
## dirty.fctr.predict.Interact.High.cor.Y.glm.Y
## 1 92
## 2 131
## Prediction
## Reference N Y
## N 71 92
## Y 32 131
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.196319e-01 2.392638e-01 5.645033e-01 6.725654e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 9.188038e-06 1.168497e-07
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.66666667
## 2 0.1 0.66666667
## 3 0.2 0.66666667
## 4 0.3 0.66666667
## 5 0.4 0.68656716
## 6 0.5 0.64035088
## 7 0.6 0.55681818
## 8 0.7 0.29629630
## 9 0.8 0.13445378
## 10 0.9 0.07017544
## 11 1.0 0.00000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Interact.High.cor.Y.glm.N
## 1 N 44
## 2 Y 18
## dirty.fctr.predict.Interact.High.cor.Y.glm.Y
## 1 66
## 2 92
## Prediction
## Reference N Y
## N 44 66
## Y 18 92
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.181818e-01 2.363636e-01 5.504658e-01 6.826667e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 2.774058e-04 2.926492e-07
## model_id model_method
## 1 Interact.High.cor.Y.glm glm
## feats
## 1 R.npnct19.log, R.T.dish, R.npnct19.log:R.npnct19.log, R.npnct19.log:R.npnct07.log, R.npnct19.log:R.nwrds.log, R.npnct19.log:R.T.sum
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 0.922 0.015
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.6851782 0.4 0.6787565 0.6228395
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.5645033 0.6725654 0.245679 0.6892975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.4 0.6865672 0.6181818
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.5504658 0.6826667 0.2363636 429.2393
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0633521 0.1267042
# Low.cor.X
# Predictor ids for the Low.cor.X model: rows of glb_feats_df whose cor.high.X
# is NA, whose myNearZV flag is FALSE, and whose exclude.as.feat is not 1.
# Rows where the combined condition evaluates to NA are dropped (via which()).
# Disabled alternative for binomial classification, kept for reference:
# if (glb_is_classification && glb_is_binomial)
# indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
# is.ConditionalX.y &
# (exclude.as.feat != 1))[, "id"] else
indep_vars_vctr <- glb_feats_df[
  which(with(glb_feats_df,
             is.na(cor.high.X) & !myNearZV & (exclude.as.feat != 1))),
  "id"]
# Replace features that have a designated interaction partner with the
# corresponding interaction term.
#
# For every row of `feats_df` whose `interaction.feat` is not NA, if that row's
# `id` is present in `vars_vctr` it is removed and "<interaction.feat>:<id>"
# is added instead (via union(), so the new term goes to the end and the
# result is de-duplicated).
#
# Args:
#   vars_vctr: character vector of feature ids / model terms.
#   feats_df:  data frame with `id` and `interaction.feat` columns; defaults
#              to the global glb_feats_df, so existing one-argument calls
#              behave as before.
# Returns:
#   The adjusted character vector; `vars_vctr` itself when nothing matches.
myadjust_interaction_feats <- function(vars_vctr, feats_df = glb_feats_df) {
  for (feat in subset(feats_df, !is.na(interaction.feat))$id) {
    if (feat %in% vars_vctr) {
      # which() discards NA comparisons so a missing id in feats_df cannot
      # contribute a spurious "NA:<feat>" term.
      partner <- feats_df[which(feats_df$id == feat), "interaction.feat"]
      vars_vctr <- union(setdiff(vars_vctr, feat),
                         paste0(partner, ":", feat))
    }
  }
  vars_vctr
}
# Swap in interaction terms where glb_feats_df designates them, then fit the
# Low.cor.X model: lm for regression, glm for binomial, rpart otherwise.
indep_vars_vctr <- myadjust_interaction_feats(indep_vars_vctr)
# rsp_var / rsp_var_out are passed by name (as in the All.X fitting loop)
# instead of positionally after named arguments, so the call does not depend
# on the order of myfit_mdl's formals.
ret_lst <- myfit_mdl(model_id = "Low.cor.X",
                     model_method = if (glb_is_regression) "lm" else
                                    if (glb_is_binomial) "glm" else "rpart",
                     indep_vars_vctr = indep_vars_vctr,
                     model_type = glb_model_type,
                     rsp_var = glb_rsp_var, rsp_var_out = glb_rsp_var_out,
                     fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df,
                     n_cv_folds = glb_n_cv_folds, tune_models_df = NULL)
## [1] "fitting model: Low.cor.X.glm"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.T.pork, R.T.noodl, R.npnct14.log, R.npnct04.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees"
## Aggregating results
## Fitting final model on full training set
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.24662 -0.75823 0.02299 0.74738 2.56534
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.58173 4.94888 0.118 0.90643
## R.npnct19.log 0.88132 0.26903 3.276 0.00105 **
## R.T.dish -11.53185 56.42162 -0.204 0.83805
## R.T.pork 48.20956 38.00760 1.268 0.20465
## R.T.noodl 59.95014 28.58933 2.097 0.03600 *
## R.npnct14.log 0.43694 0.30244 1.445 0.14854
## R.npnct04.log 0.07571 0.26785 0.283 0.77745
## R.T.rice 13.40108 36.31126 0.369 0.71208
## R.T.thai 0.47653 8.28141 0.058 0.95411
## R.npnct30.log 0.69052 0.87791 0.787 0.43154
## R.npnct05.log 1.21449 0.56474 2.151 0.03152 *
## R.T.chines 7.89060 15.08415 0.523 0.60090
## R.npnct09.log 0.29356 0.29516 0.995 0.31995
## R.npnct03.log -0.20251 0.70321 -0.288 0.77337
## R.T.roll 46.43015 26.76511 1.735 0.08279 .
## R.T.chicken 16.23199 32.41126 0.501 0.61650
## .rnorm 0.58871 0.18218 3.231 0.00123 **
## R.npnct18.log -0.81129 0.42132 -1.926 0.05416 .
## R.T.seattl 33.52881 55.51722 0.604 0.54589
## R.npnct21.log 0.39105 2.78040 0.141 0.88815
## R.T.vietnames 0.11086 12.03646 0.009 0.99265
## R.T.cheap 26.61250 33.95195 0.784 0.43314
## R.T.bbq -7.46045 29.94227 -0.249 0.80324
## R.T.park 29.57944 27.13995 1.090 0.27576
## R.npnct10.log 0.26046 0.44100 0.591 0.55479
## R.T.egg -1.59549 27.47853 -0.058 0.95370
## R.T.tofu 17.15550 19.86959 0.863 0.38792
## R.T.best 27.90415 35.70471 0.782 0.43449
## R.P.http -0.29126 0.54776 -0.532 0.59491
## R.npnct23.log 0.50230 2.76070 0.182 0.85562
## R.T.soup -37.63497 45.63322 -0.825 0.40953
## R.T.great 14.86915 52.14021 0.285 0.77551
## R.npnct16.log -0.52633 0.39489 -1.333 0.18258
## R.T.get 142.21417 67.33566 2.112 0.03468 *
## R.T.fri 3.05204 30.84995 0.099 0.92119
## R.T.pho -1.60526 6.67178 -0.241 0.80986
## R.T.love -38.54674 59.82397 -0.644 0.51936
## R.T.food 71.49765 92.46410 0.773 0.43938
## R.T.time 5.07293 83.31990 0.061 0.95145
## R.T.restaur 14.20694 38.64962 0.368 0.71318
## R.npnct06.log 0.09357 0.41496 0.225 0.82160
## R.T.tri 99.74937 60.94345 1.637 0.10168
## R.T.sauc 60.76601 51.54084 1.179 0.23840
## R.T.servic -16.22361 55.05025 -0.295 0.76822
## R.T.ive 22.80423 40.77760 0.559 0.57600
## R.T.can -60.41680 72.60755 -0.832 0.40535
## R.npnct20.log -1.97416 1.18533 -1.665 0.09581 .
## R.T.menu 24.37031 43.97606 0.554 0.57946
## R.T.lunch 15.10753 34.36392 0.440 0.66020
## R.P.year.colon 0.21022 2.20675 0.095 0.92411
## R.T.place 35.98049 108.40424 0.332 0.73996
## R.T.tabl -156.81201 57.56476 -2.724 0.00645 **
## R.T.fresh 79.47292 48.03868 1.654 0.09806 .
## R.T.one 28.41995 68.87160 0.413 0.67986
## R.T.breakfast -11.82103 19.64005 -0.602 0.54725
## R.sum.TfIdf 0.22030 0.79208 0.278 0.78092
## R.T.hour 22.53677 65.97051 0.342 0.73264
## R.T.tast -78.85950 56.23906 -1.402 0.16085
## R.T.order 1.39621 57.80035 0.024 0.98073
## R.T.coffe 24.68523 15.50782 1.592 0.11143
## R.T.night 8.94761 47.17573 0.190 0.84957
## R.T.taco -17.73257 20.48375 -0.866 0.38666
## R.T.price -33.10820 57.55810 -0.575 0.56515
## R.ratio.nstopwrds.nwrds -5.57579 7.41741 -0.752 0.45222
## R.T.delici -49.77091 51.62242 -0.964 0.33498
## R.T.friend -140.29480 80.32544 -1.747 0.08071 .
## R.T.pretti -37.74252 61.33181 -0.615 0.53830
## R.T.burger -14.90292 11.60171 -1.285 0.19895
## R.T.sandwich -12.53923 12.15652 -1.031 0.30231
## R.T.cake -27.41491 36.96326 -0.742 0.45828
## R.T.realli -34.25291 75.11095 -0.456 0.64837
## R.T.alway -77.10930 46.96872 -1.642 0.10065
## R.T.good 131.99715 109.76020 1.203 0.22913
## R.T.just -106.33343 81.44285 -1.306 0.19168
## R.T.pizza -3.57567 5.03561 -0.710 0.47766
## R.T.happi -79.47738 60.67822 -1.310 0.19026
## R.T.beer -8.07343 26.57343 -0.304 0.76127
## R.T.drink 13.85488 29.42900 0.471 0.63779
## R.ratio.sum.TfIdf.nwrds -10.43014 15.16809 -0.688 0.49168
## R.T.bar 54.76794 29.67510 1.846 0.06495 .
## R.T.like -197.45459 102.91687 -1.919 0.05504 .
## R.T.salad -48.78718 26.86151 -1.816 0.06933 .
## R.T.nice 29.51515 61.02429 0.484 0.62863
## R.T.chees -11.43353 33.31256 -0.343 0.73143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 451.93 on 325 degrees of freedom
## Residual deviance: 307.02 on 242 degrees of freedom
## AIC: 475.02
##
## Number of Fisher Scoring iterations: 6
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.7004405
## 3 0.2 0.7458432
## 4 0.3 0.7724868
## 5 0.4 0.7840909
## 6 0.5 0.7739938
## 7 0.6 0.7414966
## 8 0.7 0.6792453
## 9 0.8 0.5882353
## 10 0.9 0.4230769
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.4000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Low.cor.X.glm.N
## 1 N 112
## 2 Y 25
## dirty.fctr.predict.Low.cor.X.glm.Y
## 1 51
## 2 138
## Prediction
## Reference N Y
## N 112 51
## Y 25 138
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.668712e-01 5.337423e-01 7.171251e-01 8.116947e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 4.161292e-23 4.134717e-03
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6556291
## 3 0.2 0.6594203
## 4 0.3 0.6461538
## 5 0.4 0.6393443
## 6 0.5 0.6044444
## 7 0.6 0.5615764
## 8 0.7 0.4808743
## 9 0.8 0.4171779
## 10 0.9 0.3129252
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.0000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.Low.cor.X.glm.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method
## 1 Low.cor.X.glm glm
## feats
## 1 R.npnct19.log, R.T.dish, R.T.pork, R.T.noodl, R.npnct14.log, R.npnct04.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 1.187 0.122
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.8547555 0.4 0.7840909 0.5924242
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.7171251 0.8116947 0.1848485 0.6214876
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.4320488 0.5679512 0 475.019
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0650675 0.130135
# The last fit result is no longer needed; drop it before logging the next
# fit.models step (minor increment) in the chunk tracker.
rm(list = "ret_lst")
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor bgn end elapsed
## 10 fit.models 7 0 132.229 153.799 21.57
## 11 fit.models 7 1 153.799 NA NA
# Start a separate chunk tracker for the fit.models_1 sub-steps. NULL is passed
# where other myadd_chunk calls pass the existing tracker; presumably this makes
# myadd_chunk create a fresh one -- confirm against its definition.
fit.models_1_chunk_df <- myadd_chunk(NULL, "fit.models_1_bgn")
## label step_major step_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 156.967 NA NA
# Options:
# 1. rpart & rf manual tuning
# 2. rf without pca (default: with pca)
# Debugging aids (disabled): snapshot / restore the model registry.
#stop(here); sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df
#glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df
# All X that is not user excluded
# Disabled alternative for binomial classification, kept for reference:
# if (glb_is_classification && glb_is_binomial) {
# model_id_pfx <- "Conditional.X"
# # indep_vars_vctr <- setdiff(names(glb_fitobs_df), union(glb_rsp_var, glb_exclude_vars_as_features))
# indep_vars_vctr <- subset(glb_feats_df, is.ConditionalX.y &
# (exclude.as.feat != 1))[, "id"]
# } else {
model_id_pfx <- "All.X"
# Ids of glb_feats_df rows whose myNearZV flag is FALSE and whose
# exclude.as.feat is not 1 (NA conditions dropped), with designated
# interaction terms substituted by myadjust_interaction_feats().
indep_vars_vctr <- myadjust_interaction_feats(
  glb_feats_df[
    which(with(glb_feats_df, !myNearZV & (exclude.as.feat != 1))),
    "id"])
# }
# Fit one model per method in glb_models_method_vctr on the All.X feature set,
# logging each method as its own major step in fit.models_1_chunk_df.
for (method in glb_models_method_vctr) {
  fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
                              paste0("fit.models_1_", method), major.inc = TRUE)
  # Build the feature set per iteration. The shared indep_vars_vctr must not be
  # overwritten here: doing so would silently strip .rnorm from every method
  # iterated after the first rpart/rf while its model id still read plain
  # "All.X". indep_vars_vctr is therefore left untouched after the loop.
  if (method %in% c("rpart", "rf")) {
    # rpart: .rnorm wrecks the tree (per the original author's note)
    # rf: skip the scenario w/ .rnorm for speed
    mdl_indep_vars_vctr <- setdiff(indep_vars_vctr, ".rnorm")
    model_id <- paste0(model_id_pfx, ".no.rnorm")
  } else {
    mdl_indep_vars_vctr <- indep_vars_vctr
    model_id <- model_id_pfx
  }
  ret_lst <- myfit_mdl(model_id = model_id, model_method = method,
                       indep_vars_vctr = mdl_indep_vars_vctr,
                       model_type = glb_model_type,
                       rsp_var = glb_rsp_var, rsp_var_out = glb_rsp_var_out,
                       fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df,
                       n_cv_folds = glb_n_cv_folds,
                       tune_models_df = glb_tune_models_df)
# Disabled diagnostics below are kept verbatim for reference.
# If All.X.glm is less accurate than Low.Cor.X.glm
# check NA coefficients & filter appropriate terms in indep_vars_vctr
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(model_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glb_fitobs_df[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.nchrs.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.nchrs.log", glb_feats_df$id, value=TRUE), ]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.npnct14.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.npnct14.log", glb_feats_df$id, value=TRUE), ]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.T.scen", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.T.scen", glb_feats_df$id, value=TRUE), ]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.P.first", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.P.first", glb_feats_df$id, value=TRUE), ]
# all.equal(glb_allobs_df$S.nuppr.log, glb_allobs_df$A.nuppr.log)
# all.equal(glb_allobs_df$S.npnct19.log, glb_allobs_df$A.npnct19.log)
# all.equal(glb_allobs_df$S.P.year.colon, glb_allobs_df$A.P.year.colon)
# all.equal(glb_allobs_df$S.T.share, glb_allobs_df$A.T.share)
# all.equal(glb_allobs_df$H.T.clip, glb_allobs_df$H.P.daily.clip.report)
# cor(glb_allobs_df$S.T.herald, glb_allobs_df$S.T.tribun)
# dsp_obs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# dsp_obs(Abstract.contains="[Ss]hare", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glb_allobs_df[, setdiff(names(glb_allobs_df), myfind_chr_cols_df(glb_allobs_df))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(model_id=paste0(model_id_pfx, ".cp.0"), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
}
## label step_major step_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 156.967 156.983 0.016
## 2 fit.models_1_glm 2 0 156.983 NA NA
## [1] "fitting model: All.X.glm"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees"
## Aggregating results
## Fitting final model on full training set
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.30739 -0.64787 -0.00061 0.66118 3.16078
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 188.79409 87.05364 2.169 0.030105 *
## R.npnct19.log 1.19966 0.36745 3.265 0.001095 **
## R.T.dish -6.94221 73.13468 -0.095 0.924376
## R.npnct07.log 1.25903 0.55888 2.253 0.024272 *
## R.ndgts.log 0.23420 0.37598 0.623 0.533349
## R.T.pork 63.93702 46.73384 1.368 0.171278
## R.T.noodl 81.74974 31.51899 2.594 0.009496 **
## R.npnct08.log -0.72167 0.40484 -1.783 0.074650 .
## R.npnct01.log 0.16567 0.36916 0.449 0.653589
## review.niso8859.log 0.21027 0.29327 0.717 0.473393
## R.nwrds.log -79.67374 43.47148 -1.833 0.066835 .
## R.nuppr.log -2.40769 1.34545 -1.790 0.073533 .
## R.nwrds.unq.log 14.60033 4.18356 3.490 0.000483 ***
## R.nchrs.log -18.89248 10.29785 -1.835 0.066564 .
## R.npnct13.log 0.72066 1.10218 0.654 0.513208
## R.nstopwrds.log 89.07775 43.20854 2.062 0.039248 *
## R.npnct15.log -0.03452 0.36036 -0.096 0.923681
## R.npnct11.log -0.98858 0.59751 -1.654 0.098029 .
## R.npnct12.log -0.15007 0.38768 -0.387 0.698691
## R.npnct14.log 0.83068 0.38580 2.153 0.031310 *
## R.npnct04.log 0.12528 0.37067 0.338 0.735378
## R.npnct02.log -0.31286 0.27735 -1.128 0.259307
## R.T.rice 24.38648 44.45779 0.549 0.583327
## R.T.thai 13.12777 10.51314 1.249 0.211774
## R.npnct30.log 1.17275 1.00506 1.167 0.243273
## R.npnct05.log 1.48434 0.61436 2.416 0.015689 *
## R.T.dim 10.35703 18.20214 0.569 0.569356
## R.T.chines 42.60930 21.69802 1.964 0.049560 *
## R.npnct09.log 0.61665 0.35173 1.753 0.079566 .
## R.npnct03.log -0.14556 0.72190 -0.202 0.840204
## R.T.roll 71.04011 31.40716 2.262 0.023703 *
## R.T.chicken 1.62842 37.99712 0.043 0.965816
## .rnorm 0.94703 0.23848 3.971 7.16e-05 ***
## R.npnct18.log -0.93260 0.47994 -1.943 0.051998 .
## R.T.seattl 104.42511 71.06663 1.469 0.141725
## R.npnct21.log -2.95886 3.06247 -0.966 0.333960
## R.T.vietnames 16.82548 18.06621 0.931 0.351686
## R.T.cheap 20.35991 36.63630 0.556 0.578395
## R.T.bbq -17.99391 38.89144 -0.463 0.643601
## R.T.park 51.74838 45.32378 1.142 0.253558
## R.npnct10.log 0.46359 0.50179 0.924 0.355552
## R.T.egg -10.45790 30.29362 -0.345 0.729931
## R.T.tofu 20.54089 24.80292 0.828 0.407577
## R.T.best 21.01569 52.29260 0.402 0.687768
## R.P.http -0.09124 0.56747 -0.161 0.872266
## R.npnct23.log 4.22286 3.07908 1.371 0.170228
## R.T.soup -108.93841 57.54569 -1.893 0.058348 .
## R.T.great 136.08554 72.09684 1.888 0.059088 .
## R.npnct16.log -0.58933 0.47567 -1.239 0.215364
## R.T.get 241.67201 89.04773 2.714 0.006648 **
## R.T.fri -1.74843 33.41896 -0.052 0.958275
## R.T.pho -0.36897 7.11774 -0.052 0.958658
## R.T.love -14.92423 88.02144 -0.170 0.865362
## R.T.food 92.45147 111.22066 0.831 0.405836
## R.T.time 51.07279 101.11452 0.505 0.613490
## R.T.restaur -22.10902 50.18167 -0.441 0.659517
## R.npnct06.log 0.24875 0.47787 0.521 0.602678
## R.T.tri 210.05187 78.97800 2.660 0.007823 **
## R.T.sauc 77.71885 64.62491 1.203 0.229126
## R.T.servic 29.40175 76.52148 0.384 0.700809
## R.T.ive 23.73467 56.83746 0.418 0.676248
## R.T.can -50.85935 92.56450 -0.549 0.582698
## R.npnct20.log -1.79310 1.32254 -1.356 0.175163
## R.T.menu 81.01788 57.17865 1.417 0.156505
## R.T.lunch 15.40980 43.28946 0.356 0.721862
## R.P.year.colon -0.28954 3.81774 -0.076 0.939545
## R.T.place 49.41132 129.08230 0.383 0.701876
## R.T.tabl -171.14863 66.76079 -2.564 0.010359 *
## R.T.fresh 150.46643 64.18762 2.344 0.019070 *
## R.T.one -18.54582 91.08474 -0.204 0.838658
## R.T.breakfast -24.92212 21.48178 -1.160 0.245987
## R.sum.TfIdf 0.67848 0.98419 0.689 0.490586
## R.T.hour 47.05742 78.41094 0.600 0.548414
## R.T.tast -63.22811 68.19168 -0.927 0.353817
## R.T.order 106.24494 74.37035 1.429 0.153121
## R.T.coffe 41.66867 24.30083 1.715 0.086400 .
## R.T.night -2.81157 59.03278 -0.048 0.962013
## R.T.taco -30.39188 22.83826 -1.331 0.183273
## R.T.price -31.31761 69.77071 -0.449 0.653529
## R.ratio.nstopwrds.nwrds -225.42016 111.62753 -2.019 0.043446 *
## R.T.delici -2.85499 62.98073 -0.045 0.963843
## R.T.friend -237.18829 103.27408 -2.297 0.021637 *
## R.T.pretti -5.48074 70.55819 -0.078 0.938085
## R.T.burger -17.30024 13.60866 -1.271 0.203634
## R.T.sandwich -17.64580 15.26618 -1.156 0.247732
## R.T.cake -50.73165 45.38062 -1.118 0.263604
## R.T.realli 89.48848 87.87358 1.018 0.308499
## R.T.alway -84.83959 59.53146 -1.425 0.154122
## R.T.good 179.76155 134.98028 1.332 0.182939
## R.T.just -170.28139 99.78368 -1.707 0.087914 .
## R.T.pizza -6.75932 6.49866 -1.040 0.298289
## R.T.happi -125.02361 71.11012 -1.758 0.078719 .
## R.T.beer -29.07964 31.78906 -0.915 0.360313
## R.T.drink 36.37235 34.53657 1.053 0.292270
## R.ratio.sum.TfIdf.nwrds 30.64707 30.18171 1.015 0.309906
## R.T.bar 64.87459 33.75402 1.922 0.054608 .
## R.T.like -272.19702 127.29146 -2.138 0.032486 *
## R.T.salad -58.11500 38.12010 -1.525 0.127378
## R.T.nice 114.00044 82.79769 1.377 0.168557
## R.T.chees -6.85395 40.06525 -0.171 0.864169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 451.93 on 325 degrees of freedom
## Residual deviance: 262.24 on 226 degrees of freedom
## AIC: 462.24
##
## Number of Fisher Scoring iterations: 7
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.7476636
## 3 0.2 0.7830424
## 4 0.3 0.8160000
## 5 0.4 0.8125000
## 6 0.5 0.8012232
## 7 0.6 0.7641196
## 8 0.7 0.7482014
## 9 0.8 0.6533865
## 10 0.9 0.5135135
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.All.X.glm.N dirty.fctr.predict.All.X.glm.Y
## 1 N 104 59
## 2 Y 10 153
## Prediction
## Reference N Y
## N 104 59
## Y 10 153
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.883436e-01 5.766871e-01 7.399391e-01 8.314218e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 6.404038e-27 7.536016e-09
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6643357
## 3 0.2 0.6541353
## 4 0.3 0.6328125
## 5 0.4 0.6393443
## 6 0.5 0.6375546
## 7 0.6 0.6090909
## 8 0.7 0.5865385
## 9 0.8 0.5463918
## 10 0.9 0.4337349
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.0000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.All.X.glm.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method
## 1 All.X.glm glm
## feats
## 1 R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 1.355 0.156
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.8962701 0.3 0.816 0.5705387
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.7399391 0.8314218 0.1410774 0.6292562
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.4320488 0.5679512 0 462.2375
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01401762 0.02803523
## label step_major step_minor bgn end elapsed
## 2 fit.models_1_glm 2 0 156.983 161.452 4.469
## 3 fit.models_1_bayesglm 3 0 161.452 NA NA
## [1] "fitting model: All.X.bayesglm"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees"
## Loading required package: arm
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: Matrix
## Loading required package: lme4
## Loading required package: Rcpp
##
## arm (Version 1.8-5, built: 2015-05-13)
##
## Working directory is /Users/bbalaji-2012/Documents/Work/Courses/Coursera/text-mining/Assignments/Competition-R
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.16500 -0.75391 0.02749 0.74521 2.37433
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.668e+00 6.390e+00 -0.730 0.465092
## R.npnct19.log 8.155e-01 2.864e-01 2.847 0.004412 **
## R.T.dish 2.928e+00 5.651e+01 0.052 0.958683
## R.npnct07.log 9.751e-01 3.972e-01 2.455 0.014093 *
## R.ndgts.log 1.657e-01 2.723e-01 0.609 0.542749
## R.T.pork 4.530e+01 3.590e+01 1.262 0.207030
## R.T.noodl 5.621e+01 2.592e+01 2.168 0.030148 *
## R.npnct08.log -4.103e-01 3.062e-01 -1.340 0.180204
## R.npnct01.log -1.296e-02 2.763e-01 -0.047 0.962582
## review.niso8859.log 1.602e-01 2.321e-01 0.690 0.489938
## R.nwrds.log -3.393e-01 8.657e-01 -0.392 0.695128
## R.nuppr.log -1.273e+00 7.217e-01 -1.764 0.077701 .
## R.nwrds.unq.log 3.279e+00 1.543e+00 2.125 0.033626 *
## R.nchrs.log -6.635e-01 9.525e-01 -0.697 0.486055
## R.npnct13.log -2.805e-02 5.646e-01 -0.050 0.960375
## R.nstopwrds.log -2.622e-01 8.324e-01 -0.315 0.752791
## R.npnct15.log -9.084e-02 2.971e-01 -0.306 0.759801
## R.npnct11.log -4.421e-01 3.907e-01 -1.132 0.257778
## R.npnct12.log -8.661e-02 3.018e-01 -0.287 0.774116
## R.npnct14.log 4.573e-01 3.078e-01 1.486 0.137332
## R.npnct04.log 1.004e-01 2.977e-01 0.337 0.735862
## R.npnct02.log -2.521e-01 2.254e-01 -1.118 0.263474
## R.T.rice 9.866e+00 3.508e+01 0.281 0.778541
## R.T.thai 7.473e+00 8.210e+00 0.910 0.362647
## R.npnct30.log 6.444e-01 8.191e-01 0.787 0.431453
## R.npnct05.log 1.093e+00 5.385e-01 2.030 0.042352 *
## R.T.dim 1.508e+01 1.378e+01 1.094 0.273760
## R.T.chines 1.094e+01 1.519e+01 0.720 0.471462
## R.npnct09.log 4.417e-01 2.947e-01 1.499 0.133857
## R.npnct03.log -5.299e-02 6.662e-01 -0.080 0.936603
## R.T.roll 5.087e+01 2.653e+01 1.918 0.055162 .
## R.T.chicken 1.440e+01 3.251e+01 0.443 0.657922
## .rnorm 5.897e-01 1.781e-01 3.311 0.000929 ***
## R.npnct18.log -7.596e-01 4.137e-01 -1.836 0.066367 .
## R.T.seattl 3.708e+01 5.159e+01 0.719 0.472337
## R.npnct21.log -9.442e-03 2.160e+00 -0.004 0.996511
## R.T.vietnames 3.344e+00 1.171e+01 0.286 0.775225
## R.T.cheap 1.863e+01 3.026e+01 0.616 0.538098
## R.T.bbq -3.402e+00 2.957e+01 -0.115 0.908415
## R.T.park 2.290e+01 2.480e+01 0.923 0.355878
## R.npnct10.log 3.693e-01 4.309e-01 0.857 0.391518
## R.T.egg -4.401e+00 2.625e+01 -0.168 0.866876
## R.T.tofu 1.551e+01 1.997e+01 0.776 0.437550
## R.T.best 2.046e+01 3.427e+01 0.597 0.550554
## R.P.http -1.818e-01 5.194e-01 -0.350 0.726361
## R.npnct23.log 8.250e-01 2.139e+00 0.386 0.699736
## R.T.soup -3.832e+01 4.211e+01 -0.910 0.362850
## R.T.great 4.809e+01 5.305e+01 0.907 0.364647
## R.npnct16.log -5.282e-01 3.764e-01 -1.403 0.160520
## R.T.get 1.510e+02 6.589e+01 2.292 0.021901 *
## R.T.fri 7.143e+00 2.934e+01 0.243 0.807643
## R.T.pho -3.152e-01 6.144e+00 -0.051 0.959084
## R.T.love -1.715e+01 5.819e+01 -0.295 0.768194
## R.T.food 6.152e+01 8.759e+01 0.702 0.482434
## R.T.time 3.481e+00 8.328e+01 0.042 0.966656
## R.T.restaur 6.127e+00 3.691e+01 0.166 0.868154
## R.npnct06.log 1.044e-01 3.889e-01 0.268 0.788322
## R.T.tri 1.097e+02 5.796e+01 1.892 0.058442 .
## R.T.sauc 4.351e+01 5.101e+01 0.853 0.393698
## R.T.servic 1.401e-01 5.328e+01 0.003 0.997902
## R.T.ive 2.366e+01 4.181e+01 0.566 0.571465
## R.T.can -2.863e+01 7.313e+01 -0.392 0.695411
## R.npnct20.log -1.827e+00 1.152e+00 -1.585 0.112882
## R.T.menu 4.739e+01 4.337e+01 1.093 0.274492
## R.T.lunch 9.334e+00 3.532e+01 0.264 0.791607
## R.P.year.colon 4.452e-02 1.564e+00 0.028 0.977288
## R.T.place 4.064e+01 1.061e+02 0.383 0.701623
## R.T.tabl -1.477e+02 5.646e+01 -2.617 0.008882 **
## R.T.fresh 9.972e+01 4.947e+01 2.016 0.043845 *
## R.T.one 3.884e+01 6.694e+01 0.580 0.561779
## R.T.breakfast -1.840e+01 1.912e+01 -0.962 0.335896
## R.sum.TfIdf 4.526e-01 7.822e-01 0.579 0.562795
## R.T.hour 5.588e+00 6.093e+01 0.092 0.926931
## R.T.tast -7.653e+01 5.512e+01 -1.388 0.165011
## R.T.order 1.471e+01 5.866e+01 0.251 0.801968
## R.T.coffe 3.032e+01 1.747e+01 1.735 0.082672 .
## R.T.night 6.363e-01 4.529e+01 0.014 0.988790
## R.T.taco -1.643e+01 1.981e+01 -0.830 0.406719
## R.T.price -3.715e+01 5.602e+01 -0.663 0.507152
## R.ratio.nstopwrds.nwrds -4.358e+00 8.480e+00 -0.514 0.607336
## R.T.delici -3.245e+01 5.071e+01 -0.640 0.522195
## R.T.friend -1.553e+02 8.120e+01 -1.913 0.055800 .
## R.T.pretti -1.578e+01 5.739e+01 -0.275 0.783385
## R.T.burger -1.488e+01 1.171e+01 -1.272 0.203535
## R.T.sandwich -9.514e+00 1.194e+01 -0.797 0.425502
## R.T.cake -3.860e+01 3.631e+01 -1.063 0.287837
## R.T.realli 1.431e+01 7.417e+01 0.193 0.847042
## R.T.alway -6.832e+01 4.511e+01 -1.514 0.129905
## R.T.good 1.323e+02 1.078e+02 1.228 0.219634
## R.T.just -1.103e+02 8.207e+01 -1.344 0.179009
## R.T.pizza -4.957e+00 5.070e+00 -0.978 0.328172
## R.T.happi -6.861e+01 5.456e+01 -1.257 0.208573
## R.T.beer -1.391e+01 2.536e+01 -0.548 0.583407
## R.T.drink 1.676e+01 2.822e+01 0.594 0.552498
## R.ratio.sum.TfIdf.nwrds 1.751e+00 1.773e+01 0.099 0.921335
## R.T.bar 5.382e+01 2.826e+01 1.905 0.056817 .
## R.T.like -1.814e+02 9.833e+01 -1.845 0.065084 .
## R.T.salad -4.613e+01 2.817e+01 -1.638 0.101510
## R.T.nice 3.926e+01 6.101e+01 0.643 0.519902
## R.T.chees -4.868e-03 2.486e+00 -0.002 0.998438
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 451.93 on 325 degrees of freedom
## Residual deviance: 281.80 on 226 degrees of freedom
## AIC: 481.8
##
## Number of Fisher Scoring iterations: 54
##
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.7139690
## 3 0.2 0.7506053
## 4 0.3 0.7792208
## 5 0.4 0.7887324
## 6 0.5 0.7962963
## 7 0.6 0.7744108
## 8 0.7 0.6768061
## 9 0.8 0.6198347
## 10 0.9 0.4154589
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.5000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.All.X.bayesglm.N
## 1 N 131
## 2 Y 34
## dirty.fctr.predict.All.X.bayesglm.Y
## 1 32
## 2 129
## Prediction
## Reference N Y
## N 131 32
## Y 34 129
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.975460e-01 5.950920e-01 7.497707e-01 8.398213e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 1.136466e-28 9.020346e-01
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6622517
## 3 0.2 0.6425993
## 4 0.3 0.6384615
## 5 0.4 0.6502058
## 6 0.5 0.6315789
## 7 0.6 0.5876777
## 8 0.7 0.5185185
## 9 0.8 0.4800000
## 10 0.9 0.3624161
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.0000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.All.X.bayesglm.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method
## 1 All.X.bayesglm bayesglm
## feats
## 1 R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 2.763 0.664
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.8808009 0.5 0.7962963 0.5951178
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.7497707 0.8398213 0.1902357 0.6305785
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB min.aic.fit
## 1 0.4320488 0.5679512 0 481.8047
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.02343335 0.0468667
## label step_major step_minor bgn end elapsed
## 3 fit.models_1_bayesglm 3 0 161.452 166.857 5.405
## 4 fit.models_1_rpart 4 0 166.858 NA NA
## [1] "fitting model: All.X.no.rnorm.rpart"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees"
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0307 on full training set
## Warning in myfit_mdl(model_id = model_id, model_method = method,
## indep_vars_vctr = indep_vars_vctr, : model's bestTune found at an extreme
## of tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 326
##
## CP nsplit rel error
## 1 0.22085890 0 1.0000000
## 2 0.10429448 1 0.7791411
## 3 0.03067485 2 0.6748466
##
## Variable importance
## R.T.noodl R.npnct04.log R.ndgts.log
## 22 15 10
## R.ratio.sum.TfIdf.nwrds R.npnct13.log R.nchrs.log
## 9 9 8
## R.nwrds.log R.T.tofu R.T.chines
## 8 4 4
## R.T.soup R.T.thai R.T.rice
## 4 3 3
##
## Node number 1: 326 observations, complexity param=0.2208589
## predicted class=N expected loss=0.5 P(node) =1
## class counts: 163 163
## probabilities: 0.500 0.500
## left son=2 (250 obs) right son=3 (76 obs)
## Primary splits:
## R.T.noodl < 0.0002376254 to the left, improve=11.118320, (0 missing)
## R.T.rice < 0.001351905 to the left, improve=10.388560, (0 missing)
## R.T.tofu < 0.001764125 to the left, improve=10.296180, (0 missing)
## R.nwrds.log < 7.061723 to the left, improve= 9.615009, (0 missing)
## R.T.bbq < 0.003557375 to the left, improve= 9.527261, (0 missing)
## Surrogate splits:
## R.T.tofu < 0.0003902179 to the left, agree=0.813, adj=0.197, (0 split)
## R.T.chines < 0.003191744 to the left, agree=0.804, adj=0.158, (0 split)
## R.T.soup < 0.007752789 to the left, agree=0.804, adj=0.158, (0 split)
## R.T.thai < 0.01523154 to the left, agree=0.801, adj=0.145, (0 split)
## R.T.rice < 0.009207799 to the left, agree=0.794, adj=0.118, (0 split)
##
## Node number 2: 250 observations, complexity param=0.1042945
## predicted class=N expected loss=0.428 P(node) =0.7668712
## class counts: 143 107
## probabilities: 0.572 0.428
## left son=4 (167 obs) right son=5 (83 obs)
## Primary splits:
## R.npnct04.log < 1.242453 to the left, improve=7.559143, (0 missing)
## R.T.rice < 0.001351905 to the left, improve=7.210556, (0 missing)
## R.T.tofu < 0.0006109828 to the left, improve=6.674168, (0 missing)
## R.npnct13.log < 4.197174 to the left, improve=5.899088, (0 missing)
## R.npnct19.log < 2.524928 to the left, improve=5.753920, (0 missing)
## Surrogate splits:
## R.ndgts.log < 2.970086 to the left, agree=0.884, adj=0.651, (0 split)
## R.ratio.sum.TfIdf.nwrds < 0.001913251 to the right, agree=0.868, adj=0.602, (0 split)
## R.npnct13.log < 4.584915 to the left, agree=0.864, adj=0.590, (0 split)
## R.nwrds.log < 7.211163 to the left, agree=0.852, adj=0.554, (0 split)
## R.nchrs.log < 8.929248 to the left, agree=0.852, adj=0.554, (0 split)
##
## Node number 3: 76 observations
## predicted class=Y expected loss=0.2631579 P(node) =0.2331288
## class counts: 20 56
## probabilities: 0.263 0.737
##
## Node number 4: 167 observations
## predicted class=N expected loss=0.3413174 P(node) =0.5122699
## class counts: 110 57
## probabilities: 0.659 0.341
##
## Node number 5: 83 observations
## predicted class=Y expected loss=0.3975904 P(node) =0.2546012
## class counts: 33 50
## probabilities: 0.398 0.602
##
## n= 326
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 326 163 N (0.5000000 0.5000000)
## 2) R.T.noodl< 0.0002376254 250 107 N (0.5720000 0.4280000)
## 4) R.npnct04.log< 1.242453 167 57 N (0.6586826 0.3413174) *
## 5) R.npnct04.log>=1.242453 83 33 Y (0.3975904 0.6024096) *
## 3) R.T.noodl>=0.0002376254 76 20 Y (0.2631579 0.7368421) *
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6666667
## 3 0.2 0.6666667
## 4 0.3 0.6666667
## 5 0.4 0.6583851
## 6 0.5 0.6583851
## 7 0.6 0.6583851
## 8 0.7 0.4686192
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rpart.Y
## 1 N 163
## 2 Y 163
## Prediction
## Reference N Y
## N 0 163
## Y 0 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.444047e-01 5.555953e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.220784e-01 6.820624e-37
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6666667
## 3 0.2 0.6666667
## 4 0.3 0.6666667
## 5 0.4 0.5806452
## 6 0.5 0.5806452
## 7 0.6 0.5806452
## 8 0.7 0.4910180
## 9 0.8 0.0000000
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rpart.Y
## 1 N 110
## 2 Y 110
## Prediction
## Reference N Y
## N 0 110
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.000000e-01 0.000000e+00 4.320488e-01 5.679512e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 5.268661e-01 2.675482e-25
## model_id model_method
## 1 All.X.no.rnorm.rpart rpart
## feats
## 1 R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 1.422 0.134
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.6785351 0.3 0.6666667 0.5800786
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.4444047 0.5555953 0.1601571 0.6192562
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.6666667 0.5
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4320488 0.5679512 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.04779376 0.09558752
## label step_major step_minor bgn end elapsed
## 4 fit.models_1_rpart 4 0 166.858 171.888 5.03
## 5 fit.models_1_rf 5 0 171.888 NA NA
## [1] "fitting model: All.X.no.rnorm.rf"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees"
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 50 on full training set
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 326 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 652 matrix numeric
## oob.times 326 -none- numeric
## classes 2 -none- character
## importance 98 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 326 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 98 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6848739
## 3 0.2 0.8907104
## 4 0.3 0.9939024
## 5 0.4 1.0000000
## 6 0.5 1.0000000
## 7 0.6 1.0000000
## 8 0.7 0.9907121
## 9 0.8 0.7970480
## 10 0.9 0.2842105
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.6000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rf.N
## 1 N 163
## 2 Y NA
## dirty.fctr.predict.All.X.no.rnorm.rf.Y
## 1 NA
## 2 163
## Prediction
## Reference N Y
## N 163 0
## Y 0 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 1.000000e+00 1.000000e+00 9.887482e-01 1.000000e+00 5.000000e-01
## AccuracyPValue McnemarPValue
## 7.315119e-99 NaN
## [1] " calling mypredict_mdl for OOB:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6666667
## 3 0.2 0.6646341
## 4 0.3 0.6732026
## 5 0.4 0.6307692
## 6 0.5 0.5687204
## 7 0.6 0.4739884
## 8 0.7 0.3309353
## 9 0.8 0.1196581
## 10 0.9 0.0000000
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.3000 to maximize f.score.OOB"
## dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rf.N
## 1 N 17
## 2 Y 7
## dirty.fctr.predict.All.X.no.rnorm.rf.Y
## 1 93
## 2 103
## Prediction
## Reference N Y
## N 17 93
## Y 7 103
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 5.454545e-01 9.090909e-02 4.771531e-01 6.125145e-01 5.000000e-01
## AccuracyPValue McnemarPValue
## 1.000461e-01 1.895907e-17
## model_id model_method
## 1 All.X.no.rnorm.rf rf
## feats
## 1 R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 3 5.582 1.534
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 1 0.6 1 0.6167228
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## 1 0.9887482 1 0.2334456 0.6240496
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.6732026 0.5454545
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4771531 0.6125145 0.09090909
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.05141116 0.1028223
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# easier to exclude features
#model_id_pfx <- "";
# indep_vars_vctr <- setdiff(names(glb_fitobs_df),
# union(union(glb_rsp_var, glb_exclude_vars_as_features),
# c("<feat1_name>", "<feat2_name>")))
# method <- ""
# easier to include features
# model_id <- "All.Interact.X";
# indep_vars_vctr <- subset(glb_feats_df, !myNearZV &
# (exclude.as.feat != 1))[, "id"]
# indep_vars_vctr <- paste("R.npnct19.log",
# setdiff(indep_vars_vctr, c(".rnorm", "R.npnct19.log")),
# sep="*")
# for (method in c("glm", "bayesglm")) {
# ret_lst <- myfit_mdl(model_id=model_id, model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df)
# csm_mdl_id <- paste0(model_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[
# paste0(model_id, ".", method)]])
# print(head(csm_featsimp_df))
# }
#print(dsp_models_df <- orderBy(model_sel_frmla, glb_models_df)[, dsp_models_cols])
#csm_featsimp_df[grepl("H.npnct19.log", row.names(csm_featsimp_df)), , FALSE]
#csm_OOBobs_df <- glb_get_predictions(glb_OOBobs_df, mdl_id=csm_mdl_id, rsp_var_out=glb_rsp_var_out, prob_threshold_def=glb_models_df[glb_models_df$model_id == csm_mdl_id, "opt.prob.threshold.OOB"])
#print(sprintf("%s OOB confusion matrix & accuracy: ", csm_mdl_id)); print(t(confusionMatrix(csm_OOBobs_df[, paste0(glb_rsp_var_out, csm_mdl_id)], csm_OOBobs_df[, glb_rsp_var])$table))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$importance)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$importance)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
#print(sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id)); print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)], glb_OOBobs_df[, glb_rsp_var])$table))
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glb_fitobs_df),
# union(glb_rsp_var, glb_exclude_vars_as_features)))
# indep_vars_vctr_lst[[feat]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl(model_id=paste0(model_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_n_cv_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glb_model_metric_terms,
# model_summaryFunction=glb_model_metric_smmry,
# model_metric=glb_model_metric,
# model_metric_maximize=glb_model_metric_maximize)
# Simplify a model
# fit_df <- glb_fitobs_df; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glb_fitobs_df, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glb_model_metric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Display the accumulated model-comparison table: one row per fitted model
# (model_id, method, features, timing and fit/OOB metrics, per the printed output).
print(glb_models_df)
## model_id model_method
## MFO.myMFO_classfr MFO.myMFO_classfr myMFO_classfr
## Random.myrandom_classfr Random.myrandom_classfr myrandom_classfr
## Max.cor.Y.cv.0.rpart Max.cor.Y.cv.0.rpart rpart
## Max.cor.Y.cv.0.cp.0.rpart Max.cor.Y.cv.0.cp.0.rpart rpart
## Max.cor.Y.rpart Max.cor.Y.rpart rpart
## Max.cor.Y.glm Max.cor.Y.glm glm
## Interact.High.cor.Y.glm Interact.High.cor.Y.glm glm
## Low.cor.X.glm Low.cor.X.glm glm
## All.X.glm All.X.glm glm
## All.X.bayesglm All.X.bayesglm bayesglm
## All.X.no.rnorm.rpart All.X.no.rnorm.rpart rpart
## All.X.no.rnorm.rf All.X.no.rnorm.rf rf
## feats
## MFO.myMFO_classfr .rnorm
## Random.myrandom_classfr .rnorm
## Max.cor.Y.cv.0.rpart R.npnct19.log, R.T.dish
## Max.cor.Y.cv.0.cp.0.rpart R.npnct19.log, R.T.dish
## Max.cor.Y.rpart R.npnct19.log, R.T.dish
## Max.cor.Y.glm R.npnct19.log, R.T.dish
## Interact.High.cor.Y.glm R.npnct19.log, R.T.dish, R.npnct19.log:R.npnct19.log, R.npnct19.log:R.npnct07.log, R.npnct19.log:R.nwrds.log, R.npnct19.log:R.T.sum
## Low.cor.X.glm R.npnct19.log, R.T.dish, R.T.pork, R.T.noodl, R.npnct14.log, R.npnct04.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.glm R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.bayesglm R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.no.rnorm.rpart R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.no.rnorm.rf R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything
## MFO.myMFO_classfr 0 0.462
## Random.myrandom_classfr 0 0.255
## Max.cor.Y.cv.0.rpart 0 0.617
## Max.cor.Y.cv.0.cp.0.rpart 0 0.470
## Max.cor.Y.rpart 3 0.972
## Max.cor.Y.glm 1 0.931
## Interact.High.cor.Y.glm 1 0.922
## Low.cor.X.glm 1 1.187
## All.X.glm 1 1.355
## All.X.bayesglm 1 2.763
## All.X.no.rnorm.rpart 3 1.422
## All.X.no.rnorm.rf 3 5.582
## min.elapsedtime.final max.auc.fit
## MFO.myMFO_classfr 0.002 0.5000000
## Random.myrandom_classfr 0.001 0.5000000
## Max.cor.Y.cv.0.rpart 0.014 0.5000000
## Max.cor.Y.cv.0.cp.0.rpart 0.018 0.7135383
## Max.cor.Y.rpart 0.014 0.6865708
## Max.cor.Y.glm 0.014 0.6527532
## Interact.High.cor.Y.glm 0.015 0.6851782
## Low.cor.X.glm 0.122 0.8547555
## All.X.glm 0.156 0.8962701
## All.X.bayesglm 0.664 0.8808009
## All.X.no.rnorm.rpart 0.134 0.6785351
## All.X.no.rnorm.rf 1.534 1.0000000
## opt.prob.threshold.fit max.f.score.fit
## MFO.myMFO_classfr 0.5 0.6666667
## Random.myrandom_classfr 0.5 0.6666667
## Max.cor.Y.cv.0.rpart 0.5 0.6666667
## Max.cor.Y.cv.0.cp.0.rpart 0.4 0.6705202
## Max.cor.Y.rpart 0.3 0.6694387
## Max.cor.Y.glm 0.4 0.6732673
## Interact.High.cor.Y.glm 0.4 0.6787565
## Low.cor.X.glm 0.4 0.7840909
## All.X.glm 0.3 0.8160000
## All.X.bayesglm 0.5 0.7962963
## All.X.no.rnorm.rpart 0.3 0.6666667
## All.X.no.rnorm.rf 0.6 1.0000000
## max.Accuracy.fit max.AccuracyLower.fit
## MFO.myMFO_classfr 0.5000000 0.4444047
## Random.myrandom_classfr 0.5000000 0.4444047
## Max.cor.Y.cv.0.rpart 0.5000000 0.4444047
## Max.cor.Y.cv.0.cp.0.rpart 0.6503067 0.5958226
## Max.cor.Y.rpart 0.5672840 0.4565784
## Max.cor.Y.glm 0.6010101 0.5396076
## Interact.High.cor.Y.glm 0.6228395 0.5645033
## Low.cor.X.glm 0.5924242 0.7171251
## All.X.glm 0.5705387 0.7399391
## All.X.bayesglm 0.5951178 0.7497707
## All.X.no.rnorm.rpart 0.5800786 0.4444047
## All.X.no.rnorm.rf 0.6167228 0.9887482
## max.AccuracyUpper.fit max.Kappa.fit max.auc.OOB
## MFO.myMFO_classfr 0.5555953 0.0000000 0.5000000
## Random.myrandom_classfr 0.5555953 0.0000000 0.5000000
## Max.cor.Y.cv.0.rpart 0.5555953 0.0000000 0.5000000
## Max.cor.Y.cv.0.cp.0.rpart 0.7020312 0.3006135 0.6429752
## Max.cor.Y.rpart 0.5677366 0.1345679 0.6256612
## Max.cor.Y.glm 0.6488323 0.2020202 0.6812810
## Interact.High.cor.Y.glm 0.6725654 0.2456790 0.6892975
## Low.cor.X.glm 0.8116947 0.1848485 0.6214876
## All.X.glm 0.8314218 0.1410774 0.6292562
## All.X.bayesglm 0.8398213 0.1902357 0.6305785
## All.X.no.rnorm.rpart 0.5555953 0.1601571 0.6192562
## All.X.no.rnorm.rf 1.0000000 0.2334456 0.6240496
## opt.prob.threshold.OOB max.f.score.OOB
## MFO.myMFO_classfr 0.5 0.6666667
## Random.myrandom_classfr 0.5 0.6666667
## Max.cor.Y.cv.0.rpart 0.5 0.6666667
## Max.cor.Y.cv.0.cp.0.rpart 0.2 0.6666667
## Max.cor.Y.rpart 0.2 0.6666667
## Max.cor.Y.glm 0.3 0.6666667
## Interact.High.cor.Y.glm 0.4 0.6865672
## Low.cor.X.glm 0.0 0.6666667
## All.X.glm 0.0 0.6666667
## All.X.bayesglm 0.0 0.6666667
## All.X.no.rnorm.rpart 0.3 0.6666667
## All.X.no.rnorm.rf 0.3 0.6732026
## max.Accuracy.OOB max.AccuracyLower.OOB
## MFO.myMFO_classfr 0.5000000 0.4320488
## Random.myrandom_classfr 0.5000000 0.4320488
## Max.cor.Y.cv.0.rpart 0.5000000 0.4320488
## Max.cor.Y.cv.0.cp.0.rpart 0.5000000 0.4320488
## Max.cor.Y.rpart 0.5000000 0.4320488
## Max.cor.Y.glm 0.5000000 0.4320488
## Interact.High.cor.Y.glm 0.6181818 0.5504658
## Low.cor.X.glm 0.5000000 0.4320488
## All.X.glm 0.5000000 0.4320488
## All.X.bayesglm 0.5000000 0.4320488
## All.X.no.rnorm.rpart 0.5000000 0.4320488
## All.X.no.rnorm.rf 0.5454545 0.4771531
## max.AccuracyUpper.OOB max.Kappa.OOB
## MFO.myMFO_classfr 0.5679512 0.00000000
## Random.myrandom_classfr 0.5679512 0.00000000
## Max.cor.Y.cv.0.rpart 0.5679512 0.00000000
## Max.cor.Y.cv.0.cp.0.rpart 0.5679512 0.00000000
## Max.cor.Y.rpart 0.5679512 0.00000000
## Max.cor.Y.glm 0.5679512 0.00000000
## Interact.High.cor.Y.glm 0.6826667 0.23636364
## Low.cor.X.glm 0.5679512 0.00000000
## All.X.glm 0.5679512 0.00000000
## All.X.bayesglm 0.5679512 0.00000000
## All.X.no.rnorm.rpart 0.5679512 0.00000000
## All.X.no.rnorm.rf 0.6125145 0.09090909
## max.AccuracySD.fit max.KappaSD.fit min.aic.fit
## MFO.myMFO_classfr NA NA NA
## Random.myrandom_classfr NA NA NA
## Max.cor.Y.cv.0.rpart NA NA NA
## Max.cor.Y.cv.0.cp.0.rpart NA NA NA
## Max.cor.Y.rpart 0.03658677 0.07317354 NA
## Max.cor.Y.glm 0.03578184 0.07156367 434.0906
## Interact.High.cor.Y.glm 0.06335210 0.12670420 429.2393
## Low.cor.X.glm 0.06506750 0.13013501 475.0190
## All.X.glm 0.01401762 0.02803523 462.2375
## All.X.bayesglm 0.02343335 0.04686670 481.8047
## All.X.no.rnorm.rpart 0.04779376 0.09558752 NA
## All.X.no.rnorm.rf 0.05141116 0.10282231 NA
# Remove ret_lst from the workspace
# (presumably the return value of the last myfit_mdl() call -- verify).
rm(ret_lst)
# Log the "fit.models_1_end" step in this chunk's timing table. Per the table
# printed after this call, myadd_chunk fills in end/elapsed for the previous row
# and appends a new row; with major.inc=TRUE the new row's step_major is
# incremented (5 -> 6) and step_minor is 0.
fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end",
major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 5 fit.models_1_rf 5 0 171.888 180.577 8.69
## 6 fit.models_1_end 6 0 180.578 NA NA
# Log the next "fit.models" sub-step in the global timing table. Per the table
# printed after this call, major.inc=FALSE keeps step_major (7) and increments
# step_minor (1 -> 2), while closing out end/elapsed for the previous row.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 11 fit.models 7 1 153.799 180.585 26.786
## 12 fit.models 7 2 180.586 NA NA
# When a custom metric summary function is configured (glb_model_metric_smmry is
# not NULL), recompute per-model statistics on the fit and OOB observations with
# mypredict_mdl(..., ret_type="stats") and replace the columns of glb_models_df
# whose names match glb_model_metric with the recomputed values.
# NOTE(review): no output follows this block in the rendered run, so the
# condition was presumably FALSE (glb_model_metric_smmry NULL) here -- confirm.
if (!is.null(glb_model_metric_smmry)) {
# One-column data frame of model ids; the third positional argument (FALSE) is
# `drop`, which keeps the result a data frame instead of a vector.
stats_df <- glb_models_df[, "model_id", FALSE]
# Collect the "fit" statistics: one mypredict_mdl() result is appended per model.
stats_mdl_df <- data.frame()
for (model_id in stats_df$model_id) {
stats_mdl_df <- rbind(stats_mdl_df,
mypredict_mdl(glb_models_lst[[model_id]], glb_fitobs_df, glb_rsp_var,
glb_rsp_var_out, model_id, "fit",
glb_model_metric_smmry, glb_model_metric,
glb_model_metric_maximize, ret_type="stats"))
}
# No `by` is given, so merge() joins on the column names the two frames share
# (presumably model_id -- verify against mypredict_mdl's return value).
# all.x=TRUE keeps every model even if it produced no stats row.
stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
# Same collection, now against the OOB observations.
stats_mdl_df <- data.frame()
for (model_id in stats_df$model_id) {
stats_mdl_df <- rbind(stats_mdl_df,
mypredict_mdl(glb_models_lst[[model_id]], glb_OOBobs_df, glb_rsp_var,
glb_rsp_var_out, model_id, "OOB",
glb_model_metric_smmry, glb_model_metric,
glb_model_metric_maximize, ret_type="stats"))
}
stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
print("Merging following data into glb_models_df:")
# Keep the first column plus every column whose name matches glb_model_metric
# (glb_model_metric is used as a regular expression by grep()).
print(stats_mrg_df <- stats_df[, c(1, grep(glb_model_metric, names(stats_df)))])
# Show the values currently stored in glb_models_df for those same columns,
# ordered by model_id, for comparison with the recomputed ones.
print(tmp_models_df <- orderBy(~model_id, glb_models_df[, c("model_id",
grep(glb_model_metric, names(stats_df), value=TRUE))]))
# Everything in glb_models_df except the metric columns about to be replaced.
# NOTE(review): "model_id" is listed explicitly and is also still present in the
# setdiff() result, so it appears to be selected twice; data-frame indexing
# renames the duplicate to "model_id.1", which is the column dropped below.
tmp2_models_df <- glb_models_df[, c("model_id", setdiff(names(glb_models_df),
grep(glb_model_metric, names(stats_df), value=TRUE)))]
# Attach the recomputed metric columns; sort=FALSE asks merge() not to sort the
# result on the join key.
tmp3_models_df <- merge(tmp2_models_df, stats_mrg_df, all.x=TRUE, sort=FALSE)
print(tmp3_models_df)
print(names(tmp3_models_df))
# Drop the duplicate id column and store the result back into glb_models_df.
print(glb_models_df <- subset(tmp3_models_df, select=-model_id.1))
}
# Build the data frame used to plot/compare models:
#   * drop every column whose name contains "SD", "Upper" or "Lower";
#   * replace each "min.*" column with its reciprocal under an "inv.*" name
#     (presumably so that larger is better for every remaining metric -- confirm).
#
# Columns are selected with logical masks instead of `-grep(...)`: when grep()
# finds nothing, `-integer(0)` is still `integer(0)` and would select NO columns
# rather than all of them. `drop = FALSE` keeps the result a data frame even if
# a single column remains.
plt_models_df <- glb_models_df[, !grepl("SD|Upper|Lower", names(glb_models_df)),
                               drop = FALSE]
# "^min\\." matches only names with the literal "min." prefix (the original
# unescaped dot matched any character after "min").
for (var in grep("^min\\.", names(plt_models_df), value = TRUE)) {
  # NA inputs yield NA reciprocals and zero inputs yield Inf; the previously
  # considered alternative is kept for reference:
  #ifelse(all(is.na(tmp <- plt_models_df[, var])), NA, 1.0 / tmp)
  plt_models_df[, sub("^min\\.", "inv.", var)] <- 1.0 / plt_models_df[, var]
  # Remove exactly the source column: the name is compared literally, not used
  # as a regular expression, so no other column can be dropped by accident.
  plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop = FALSE]
}
print(plt_models_df)
## model_id model_method
## MFO.myMFO_classfr MFO.myMFO_classfr myMFO_classfr
## Random.myrandom_classfr Random.myrandom_classfr myrandom_classfr
## Max.cor.Y.cv.0.rpart Max.cor.Y.cv.0.rpart rpart
## Max.cor.Y.cv.0.cp.0.rpart Max.cor.Y.cv.0.cp.0.rpart rpart
## Max.cor.Y.rpart Max.cor.Y.rpart rpart
## Max.cor.Y.glm Max.cor.Y.glm glm
## Interact.High.cor.Y.glm Interact.High.cor.Y.glm glm
## Low.cor.X.glm Low.cor.X.glm glm
## All.X.glm All.X.glm glm
## All.X.bayesglm All.X.bayesglm bayesglm
## All.X.no.rnorm.rpart All.X.no.rnorm.rpart rpart
## All.X.no.rnorm.rf All.X.no.rnorm.rf rf
## feats
## MFO.myMFO_classfr .rnorm
## Random.myrandom_classfr .rnorm
## Max.cor.Y.cv.0.rpart R.npnct19.log, R.T.dish
## Max.cor.Y.cv.0.cp.0.rpart R.npnct19.log, R.T.dish
## Max.cor.Y.rpart R.npnct19.log, R.T.dish
## Max.cor.Y.glm R.npnct19.log, R.T.dish
## Interact.High.cor.Y.glm R.npnct19.log, R.T.dish, R.npnct19.log:R.npnct19.log, R.npnct19.log:R.npnct07.log, R.npnct19.log:R.nwrds.log, R.npnct19.log:R.T.sum
## Low.cor.X.glm R.npnct19.log, R.T.dish, R.T.pork, R.T.noodl, R.npnct14.log, R.npnct04.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.glm R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.bayesglm R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, .rnorm, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.no.rnorm.rpart R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## All.X.no.rnorm.rf R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns max.auc.fit
## MFO.myMFO_classfr 0 0.5000000
## Random.myrandom_classfr 0 0.5000000
## Max.cor.Y.cv.0.rpart 0 0.5000000
## Max.cor.Y.cv.0.cp.0.rpart 0 0.7135383
## Max.cor.Y.rpart 3 0.6865708
## Max.cor.Y.glm 1 0.6527532
## Interact.High.cor.Y.glm 1 0.6851782
## Low.cor.X.glm 1 0.8547555
## All.X.glm 1 0.8962701
## All.X.bayesglm 1 0.8808009
## All.X.no.rnorm.rpart 3 0.6785351
## All.X.no.rnorm.rf 3 1.0000000
## opt.prob.threshold.fit max.f.score.fit
## MFO.myMFO_classfr 0.5 0.6666667
## Random.myrandom_classfr 0.5 0.6666667
## Max.cor.Y.cv.0.rpart 0.5 0.6666667
## Max.cor.Y.cv.0.cp.0.rpart 0.4 0.6705202
## Max.cor.Y.rpart 0.3 0.6694387
## Max.cor.Y.glm 0.4 0.6732673
## Interact.High.cor.Y.glm 0.4 0.6787565
## Low.cor.X.glm 0.4 0.7840909
## All.X.glm 0.3 0.8160000
## All.X.bayesglm 0.5 0.7962963
## All.X.no.rnorm.rpart 0.3 0.6666667
## All.X.no.rnorm.rf 0.6 1.0000000
## max.Accuracy.fit max.Kappa.fit max.auc.OOB
## MFO.myMFO_classfr 0.5000000 0.0000000 0.5000000
## Random.myrandom_classfr 0.5000000 0.0000000 0.5000000
## Max.cor.Y.cv.0.rpart 0.5000000 0.0000000 0.5000000
## Max.cor.Y.cv.0.cp.0.rpart 0.6503067 0.3006135 0.6429752
## Max.cor.Y.rpart 0.5672840 0.1345679 0.6256612
## Max.cor.Y.glm 0.6010101 0.2020202 0.6812810
## Interact.High.cor.Y.glm 0.6228395 0.2456790 0.6892975
## Low.cor.X.glm 0.5924242 0.1848485 0.6214876
## All.X.glm 0.5705387 0.1410774 0.6292562
## All.X.bayesglm 0.5951178 0.1902357 0.6305785
## All.X.no.rnorm.rpart 0.5800786 0.1601571 0.6192562
## All.X.no.rnorm.rf 0.6167228 0.2334456 0.6240496
## opt.prob.threshold.OOB max.f.score.OOB
## MFO.myMFO_classfr 0.5 0.6666667
## Random.myrandom_classfr 0.5 0.6666667
## Max.cor.Y.cv.0.rpart 0.5 0.6666667
## Max.cor.Y.cv.0.cp.0.rpart 0.2 0.6666667
## Max.cor.Y.rpart 0.2 0.6666667
## Max.cor.Y.glm 0.3 0.6666667
## Interact.High.cor.Y.glm 0.4 0.6865672
## Low.cor.X.glm 0.0 0.6666667
## All.X.glm 0.0 0.6666667
## All.X.bayesglm 0.0 0.6666667
## All.X.no.rnorm.rpart 0.3 0.6666667
## All.X.no.rnorm.rf 0.3 0.6732026
## max.Accuracy.OOB max.Kappa.OOB
## MFO.myMFO_classfr 0.5000000 0.00000000
## Random.myrandom_classfr 0.5000000 0.00000000
## Max.cor.Y.cv.0.rpart 0.5000000 0.00000000
## Max.cor.Y.cv.0.cp.0.rpart 0.5000000 0.00000000
## Max.cor.Y.rpart 0.5000000 0.00000000
## Max.cor.Y.glm 0.5000000 0.00000000
## Interact.High.cor.Y.glm 0.6181818 0.23636364
## Low.cor.X.glm 0.5000000 0.00000000
## All.X.glm 0.5000000 0.00000000
## All.X.bayesglm 0.5000000 0.00000000
## All.X.no.rnorm.rpart 0.5000000 0.00000000
## All.X.no.rnorm.rf 0.5454545 0.09090909
## inv.elapsedtime.everything inv.elapsedtime.final
## MFO.myMFO_classfr 2.1645022 500.0000000
## Random.myrandom_classfr 3.9215686 1000.0000000
## Max.cor.Y.cv.0.rpart 1.6207455 71.4285714
## Max.cor.Y.cv.0.cp.0.rpart 2.1276596 55.5555556
## Max.cor.Y.rpart 1.0288066 71.4285714
## Max.cor.Y.glm 1.0741139 71.4285714
## Interact.High.cor.Y.glm 1.0845987 66.6666667
## Low.cor.X.glm 0.8424600 8.1967213
## All.X.glm 0.7380074 6.4102564
## All.X.bayesglm 0.3619254 1.5060241
## All.X.no.rnorm.rpart 0.7032349 7.4626866
## All.X.no.rnorm.rf 0.1791473 0.6518905
## inv.aic.fit
## MFO.myMFO_classfr NA
## Random.myrandom_classfr NA
## Max.cor.Y.cv.0.rpart NA
## Max.cor.Y.cv.0.cp.0.rpart NA
## Max.cor.Y.rpart NA
## Max.cor.Y.glm 0.002303667
## Interact.High.cor.Y.glm 0.002329703
## Low.cor.X.glm 0.002105179
## All.X.glm 0.002163390
## All.X.bayesglm 0.002075530
## All.X.no.rnorm.rpart NA
## All.X.no.rnorm.rf NA
# Radar plot of all models over the metric columns prepared in plt_models_df.
# The warnings that follow are about 12 models exceeding the colour palette (9)
# and shape palette (6) limits, and about rows with missing values.
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
## Warning in loop_apply(n, do.ply): Removed 4 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 87 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 7 rows containing missing values
## (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(model_id %in% grep("random|MFO", plt_models_df$model_id, value=TRUE)))))
# Compute CI for <metric>SD
# max.df: degrees of freedom = number of tuning runs - 1 (NA when there was at
# most one run). min.sd2ci.scaler: the 97.5% quantile of the t distribution with
# max.df degrees of freedom, i.e. the multiplier for a two-sided 95% interval.
glb_models_df <- mutate(glb_models_df,
max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
# For every column whose name contains "SD" (e.g. max.KappaSD.fit) derive the
# names of the matching point estimate (max.Kappa.fit) and of the Upper / Lower
# bound columns by splitting the name around "SD".
for (var in grep("SD", names(glb_models_df), value=TRUE)) {
# Does CI already exist ?
var_components <- unlist(strsplit(var, "SD"))
varActul <- paste0(var_components[1], var_components[2])
varUpper <- paste0(var_components[1], "Upper", var_components[2])
varLower <- paste0(var_components[1], "Lower", var_components[2])
# Bounds that are already present are left untouched; only the Upper column is
# checked.
if (varUpper %in% names(glb_models_df)) {
warning(varUpper, " already exists in glb_models_df")
# Assuming Lower also exists
next
}
print(sprintf("var:%s", var))
# CI is dependent on sample size in t distribution; df=n-1
# Bounds are estimate +/- scaler * SD; rows with an NA scaler get NA bounds.
# NOTE(review): the SD is used as-is (not divided by sqrt(n)) -- confirm whether
# the *SD columns already hold standard errors.
glb_models_df[, varUpper] <- glb_models_df[, varActul] +
glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
glb_models_df[, varLower] <- glb_models_df[, varActul] -
glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Plot metrics with CI
# Start two frames keyed by model_id: plt_models_df receives the point estimate
# of every metric that has an "<prefix>Upper<suffix>" column, pltCI_models_df
# receives the corresponding Upper and Lower bound columns.
plt_models_df <- glb_models_df[, "model_id", drop=FALSE]
pltCI_models_df <- plt_models_df
for (var in grep("Upper", names(glb_models_df), value=TRUE)) {
    # c(<prefix>, <suffix>) around "Upper"; gluing them back together gives the
    # name of the point-estimate column.
    var_components <- strsplit(var, "Upper")[[1]]
    col_name <- paste0(var_components, collapse="")
    plt_models_df[, col_name] <- glb_models_df[, col_name]
    for (name in paste0(var_components[1], c("Upper", "Lower"),
                        var_components[2])) {
        pltCI_models_df[, name] <- glb_models_df[, name]
    }
}
build_statsCI_data <- function(plt_models_df) {
    # Melt a wide per-model metrics frame into long form and split each metric
    # column name into its metric label and its data-set tag.
    #
    # Args:
    #   plt_models_df: data frame with a "model_id" column plus one column per
    #                  metric, named "<label>.<data>" (e.g. "max.Accuracy.fit").
    #
    # Returns:
    #   The melt() output (id column "model_id" plus the "variable" column read
    #   below) with two extra character columns:
    #     data  - the text after the last "." of the metric name ("fit")
    #     label - the text before the last "." ("max.Accuracy")
    #   A metric name without any "." is used unchanged for both columns.
    #
    # The split is done on the last "." only. The label used to be obtained by
    # splitting on paste0(".", data) as a regular expression, where "." matches
    # any character, so a name such as "max.fitness.fit" was cut at the first
    # "?fit" and produced the label "max". Working on whole vectors also keeps
    # a zero-row melt result valid (1:nrow() would iterate over c(1, 0)).
    mltd_models_df <- melt(plt_models_df, id.vars="model_id")
    variable_names <- as.character(mltd_models_df$variable)
    mltd_models_df$data <- sub("^.*[.]", "", variable_names)
    mltd_models_df$label <- sub("[.][^.]*$", "", variable_names)
    return(mltd_models_df)
}
# Long-form point estimates (model_id, variable, value, data, label).
mltd_models_df <- build_statsCI_data(plt_models_df)
# Long-form confidence bounds; label / data / type are filled in row by row below.
mltdCI_models_df <- melt(pltCI_models_df, id.vars="model_id")
for (row_ix in 1:nrow(mltdCI_models_df)) {
for (type in c("Upper", "Lower")) {
# A column name such as "max.AccuracyUpper.fit" splits on its bound type into
# c("max.Accuracy", ".fit"); a split of length > 1 identifies the type.
if (length(var_components <- unlist(strsplit(
as.character(mltdCI_models_df[row_ix, "variable"]), type))) > 1) {
#print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
mltdCI_models_df[row_ix, "label"] <- var_components[1]
# ".fit" splits on "." into c("", "fit"); the second piece is the data tag.
mltdCI_models_df[row_ix, "data"] <-
unlist(strsplit(var_components[2], "[.]"))[2]
mltdCI_models_df[row_ix, "type"] <- type
break
}
}
}
# One row per (model_id, label, data) with the bounds side by side; reshape()
# names the spread columns value.Upper / value.Lower, which the error bars use.
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
timevar="type",
idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
direction="wide")
#print(wideCI_models_df)
# Attach the point estimates to the bounds (join on the shared columns).
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
# Collect every label x data combination that is not yet in mltd_models_df; both
# the labels and the data tags iterated over come from mltd_models_df itself.
goback_vars <- c()
for (var in unique(mltd_models_df$label)) {
for (type in unique(mltd_models_df$data)) {
var_type <- paste0(var, ".", type)
# if this data is already present, next
if (var_type %in% unique(paste(mltd_models_df$label, mltd_models_df$data,
sep=".")))
next
#print(sprintf("var_type:%s", var_type))
goback_vars <- c(goback_vars, var_type)
}
}
# Pull the missing combinations straight from glb_models_df and append them.
if (length(goback_vars) > 0) {
mltd_goback_df <- build_statsCI_data(glb_models_df[, c("model_id", goback_vars)])
mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# Add model_method, which is used as the bar colour.
mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("model_id", "model_method")],
all.x=TRUE)
# Render the faceted bar chart (metric label by data set) with error bars into
# <glb_out_pfx>models_bar.png, then close the png device.
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
print(gp <- myplot_bar(mltd_models_df, "model_id", "value", colorcol_name="model_method") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=model_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
dev.off()
## quartz_off_screen
## 2
# Draw the same plot again on the current (non-file) device.
print(gp)
# used for console inspection
# Build the one-sided ordering formula used to rank the models: each evaluation
# criterion is preceded by "-" when its name contains "max" and by "+" otherwise;
# for binomial classification "- opt.prob.threshold.OOB" is appended as the last
# term.
model_evl_terms <- c(
    # rbind() + c() interleaves sign, metric, sign, metric, ...
    rbind(ifelse(grepl("max", glb_model_evl_criteria), "-", "+"),
          glb_model_evl_criteria),
    if (glb_is_classification && glb_is_binomial)
        c("-", "opt.prob.threshold.OOB"))
model_sel_frmla <- as.formula(paste(c("~ ", model_evl_terms), collapse=" "))

# Columns shown in the ranking table: id, the criteria and, for binomial
# classification, the OOB probability threshold.
dsp_models_cols <- c("model_id", glb_model_evl_criteria,
                     if (glb_is_classification && glb_is_binomial)
                         "opt.prob.threshold.OOB")
dsp_models_df <- orderBy(model_sel_frmla, glb_models_df)[, dsp_models_cols]
print(dsp_models_df)
## model_id max.Accuracy.OOB max.auc.OOB max.Kappa.OOB
## 7 Interact.High.cor.Y.glm 0.6181818 0.6892975 0.23636364
## 12 All.X.no.rnorm.rf 0.5454545 0.6240496 0.09090909
## 6 Max.cor.Y.glm 0.5000000 0.6812810 0.00000000
## 4 Max.cor.Y.cv.0.cp.0.rpart 0.5000000 0.6429752 0.00000000
## 10 All.X.bayesglm 0.5000000 0.6305785 0.00000000
## 9 All.X.glm 0.5000000 0.6292562 0.00000000
## 5 Max.cor.Y.rpart 0.5000000 0.6256612 0.00000000
## 8 Low.cor.X.glm 0.5000000 0.6214876 0.00000000
## 11 All.X.no.rnorm.rpart 0.5000000 0.6192562 0.00000000
## 1 MFO.myMFO_classfr 0.5000000 0.5000000 0.00000000
## 2 Random.myrandom_classfr 0.5000000 0.5000000 0.00000000
## 3 Max.cor.Y.cv.0.rpart 0.5000000 0.5000000 0.00000000
## min.aic.fit opt.prob.threshold.OOB
## 7 429.2393 0.4
## 12 NA 0.3
## 6 434.0906 0.3
## 4 NA 0.2
## 10 481.8047 0.0
## 9 462.2375 0.0
## 5 NA 0.2
## 8 475.0190 0.0
## 11 NA 0.3
## 1 NA 0.5
## 2 NA 0.5
## 3 NA 0.5
# Radar plot restricted to the model-selection columns held in dsp_models_df.
print(myplot_radar(radar_inp_df=dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
## Warning in loop_apply(n, do.ply): Removed 38 rows containing missing values
## (geom_point).
## Warning in loop_apply(n, do.ply): Removed 7 rows containing missing values
## (geom_text).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 12. Consider specifying shapes manually. if you must have them.
# Report the ordering formula and the top-ranked model of dsp_models_df.
print("Metrics used for model selection:"); print(model_sel_frmla)
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.auc.OOB - max.Kappa.OOB + min.aic.fit -
## opt.prob.threshold.OOB
print(sprintf("Best model id: %s", dsp_models_df[1, "model_id"]))
## [1] "Best model id: Interact.High.cor.Y.glm"
# Choose the model used from here on. A user-specified glb_sel_mdl_id wins;
# otherwise the top-ranked model is taken, except that Interact.High.cor.Y.glm is
# skipped in favour of the runner-up because myextract_mdl_feats cannot handle
# interaction terms. Missing candidates stop the run with an explicit message
# instead of leaving glb_sel_mdl_id as NA.
if (is.null(glb_sel_mdl_id)) {
    if (nrow(dsp_models_df) == 0)
        stop("glb_sel_mdl_id: dsp_models_df has no models to select from")
    glb_sel_mdl_id <- dsp_models_df[1, "model_id"]
    if (glb_sel_mdl_id == "Interact.High.cor.Y.glm") {
        warning("glb_sel_mdl_id: Interact.High.cor.Y.glm; myextract_mdl_feats does not currently support interaction terms")
        if (nrow(dsp_models_df) < 2)
            stop("glb_sel_mdl_id: no alternative to Interact.High.cor.Y.glm in dsp_models_df")
        glb_sel_mdl_id <- dsp_models_df[2, "model_id"]
    }
} else
    print(sprintf("User specified selection: %s", glb_sel_mdl_id))
## Warning: glb_sel_mdl_id: Interact.High.cor.Y.glm; myextract_mdl_feats does
## not currently support interaction terms
# Fail early when the chosen id has no fitted model (e.g. a mistyped
# user-specified id); otherwise keep the model in glb_sel_mdl and print it.
if (is.null(glb_models_lst[[glb_sel_mdl_id]]))
    stop("glb_sel_mdl_id not found in glb_models_lst: ", glb_sel_mdl_id)
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 326 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 652 matrix numeric
## oob.times 326 -none- numeric
## classes 2 -none- character
## importance 98 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 326 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 98 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] TRUE
# TODO: everything from here to save() should be consolidated into one function,
# because the same sequence of steps is executed twice more:
# in the fit.data.training & predict.data.new chunks
glb_get_predictions <- function(df, mdl_id, rsp_var_out, prob_threshold_def=NULL) {
    # Add the predictions of one fitted model to a data frame of observations.
    #
    # Args:
    #   df:                 observations to predict; must contain the features the
    #                       model needs. The regression and binomial branches also
    #                       read the glb_rsp_var column.
    #   mdl_id:             name of the model in glb_models_lst.
    #   rsp_var_out:        prefix of the prediction column; mdl_id is appended.
    #   prob_threshold_def: binomial classification only - threshold used when
    #                       glb_models_df holds no (or an NA)
    #                       opt.prob.threshold.OOB for mdl_id.
    #
    # Returns:
    #   df with extra columns, named from <rsp_var_out><mdl_id>:
    #     regression     - prediction and "<...>.err" (absolute error)
    #     binomial       - "<...>.prob" (second column of the class
    #                      probabilities) and the predicted factor
    #     multi-class    - predicted class and "<...>.prob"
    #
    # Stops when mdl_id is not in glb_models_lst, or when a fallback threshold is
    # needed and prob_threshold_def is NULL. Reads the globals glb_models_lst,
    # glb_models_df, glb_rsp_var, glb_is_regression, glb_is_classification and
    # glb_is_binomial.
    mdl <- glb_models_lst[[mdl_id]]
    if (is.null(mdl))
        stop("glb_get_predictions: model not found in glb_models_lst: ", mdl_id)
    rsp_var_out <- paste0(rsp_var_out, mdl_id)

    if (glb_is_regression) {
        df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
        print(myplot_scatter(df, glb_rsp_var, rsp_var_out, smooth=TRUE))
        df[, paste0(rsp_var_out, ".err")] <-
            abs(df[, rsp_var_out] - df[, glb_rsp_var])
        # Show the observations with the largest absolute errors.
        print(head(orderBy(reformulate(c("-", paste0(rsp_var_out, ".err"))),
                           df)))
    }

    if (glb_is_classification && glb_is_binomial) {
        # Look up the tuned threshold. The lookup yields a zero-length vector when
        # mdl_id has no row in glb_models_df, which `is.na()` alone cannot test
        # inside `if`; treat that case, a missing column and NA alike as "not
        # available" and fall back to the default.
        prob_threshold <- NULL
        if ("opt.prob.threshold.OOB" %in% names(glb_models_df))
            prob_threshold <- glb_models_df[glb_models_df$model_id == mdl_id,
                                            "opt.prob.threshold.OOB"]
        if (length(prob_threshold) == 0 || is.na(prob_threshold[1])) {
            if (is.null(prob_threshold_def))
                stop("Default probability threshold is NULL")
            warning("Using default probability threshold: ", prob_threshold_def)
            prob_threshold <- prob_threshold_def
        }
        prob_threshold <- prob_threshold[1]

        prob_var_out <- paste0(rsp_var_out, ".prob")
        df[, prob_var_out] <- predict(mdl, newdata=df, type="prob")[, 2]
        # Probability >= threshold maps to the second response level, anything
        # below to the first.
        rsp_levels <- levels(df[, glb_rsp_var])
        df[, rsp_var_out] <-
            factor(rsp_levels[(df[, prob_var_out] >= prob_threshold) * 1 + 1],
                   rsp_levels)
        # prediction stats already reported by myfit_mdl ???
    }

    if (glb_is_classification && !glb_is_binomial) {
        df[, rsp_var_out] <- predict(mdl, newdata=df, type="raw")
        # NOTE(review): type="prob" presumably returns one column per class, yet it
        # is assigned to a single column name - confirm before relying on it.
        df[, paste0(rsp_var_out, ".prob")] <-
            predict(mdl, newdata=df, type="prob")
    }

    return(df)
}
# Add the selected model's predictions to the OOB observations.
glb_OOBobs_df <- glb_get_predictions(df=glb_OOBobs_df, mdl_id=glb_sel_mdl_id,
rsp_var_out=glb_rsp_var_out)
# Flag, per OOB observation, whether the prediction equals the actual response;
# stored in the column "<glb_rsp_var_out><glb_sel_mdl_id>.accurate".
predct_accurate_var_name <- paste0(glb_rsp_var_out, glb_sel_mdl_id, ".accurate")
glb_OOBobs_df[, predct_accurate_var_name] <-
(glb_OOBobs_df[, glb_rsp_var] ==
glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)])
#stop(here"); #sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df
# Feature importance of the selected model; the importance column is duplicated
# under a model-specific name ("<glb_sel_mdl_id>.importance").
glb_featsimp_df <-
myget_feats_importance(mdl=glb_sel_mdl, featsimp_df=NULL)
glb_featsimp_df[, paste0(glb_sel_mdl_id, ".importance")] <- glb_featsimp_df$importance
print(glb_featsimp_df)
## importance All.X.no.rnorm.rf.importance
## R.T.like 100.0000000 100.0000000
## R.T.noodl 87.2391755 87.2391755
## R.T.just 79.4737855 79.4737855
## R.T.get 70.5792463 70.5792463
## R.ratio.nstopwrds.nwrds 63.6378014 63.6378014
## R.T.cheap 62.1777579 62.1777579
## R.sum.TfIdf 62.1093972 62.1093972
## R.T.friend 62.0619954 62.0619954
## R.T.tri 61.6269833 61.6269833
## R.T.food 60.4651889 60.4651889
## R.T.seattl 59.3238509 59.3238509
## R.T.roll 57.2911371 57.2911371
## R.T.tofu 56.2902664 56.2902664
## R.T.rice 55.6705172 55.6705172
## R.T.bbq 51.6988580 51.6988580
## R.T.ive 51.5467404 51.5467404
## R.T.order 49.1963338 49.1963338
## R.npnct19.log 46.6818734 46.6818734
## R.T.pretti 45.4391869 45.4391869
## R.T.one 44.5314559 44.5314559
## R.T.pork 44.0180925 44.0180925
## R.T.place 41.9281640 41.9281640
## R.T.servic 40.5746306 40.5746306
## R.T.menu 40.3916423 40.3916423
## R.T.time 39.2630311 39.2630311
## R.nwrds.log 38.4495825 38.4495825
## R.T.love 38.0948489 38.0948489
## R.T.tabl 37.8743449 37.8743449
## R.T.happi 37.8723562 37.8723562
## R.T.good 37.8082349 37.8082349
## R.T.restaur 36.9685854 36.9685854
## R.T.alway 36.9492266 36.9492266
## R.nwrds.unq.log 35.3268064 35.3268064
## R.T.fresh 34.1930178 34.1930178
## R.T.best 33.6561292 33.6561292
## review.niso8859.log 32.2958745 32.2958745
## R.T.realli 31.6891565 31.6891565
## R.T.drink 30.6058139 30.6058139
## R.T.can 30.1658680 30.1658680
## R.T.sauc 30.0404279 30.0404279
## R.T.great 29.5534482 29.5534482
## R.T.nice 29.0062734 29.0062734
## R.T.night 28.1817483 28.1817483
## R.npnct04.log 27.8521563 27.8521563
## R.T.fri 27.6661168 27.6661168
## R.T.price 27.5329395 27.5329395
## R.npnct13.log 27.2218975 27.2218975
## R.npnct01.log 27.1122732 27.1122732
## R.npnct11.log 26.8258200 26.8258200
## R.T.tast 26.7182134 26.7182134
## R.nuppr.log 26.5430417 26.5430417
## R.npnct07.log 25.3999403 25.3999403
## R.T.park 25.1221080 25.1221080
## R.T.salad 24.8395623 24.8395623
## R.T.chicken 24.6102709 24.6102709
## R.T.chines 24.5195179 24.5195179
## R.nchrs.log 24.4698101 24.4698101
## R.T.sandwich 23.3680114 23.3680114
## R.T.delici 23.1777526 23.1777526
## R.T.breakfast 22.6899122 22.6899122
## R.T.beer 22.0190978 22.0190978
## R.npnct14.log 21.9373706 21.9373706
## R.T.burger 21.9055347 21.9055347
## R.ndgts.log 20.9795332 20.9795332
## R.npnct08.log 20.0983473 20.0983473
## R.T.lunch 19.8535901 19.8535901
## R.nstopwrds.log 19.8169736 19.8169736
## R.ratio.sum.TfIdf.nwrds 19.4284228 19.4284228
## R.npnct02.log 19.1196937 19.1196937
## R.T.dish 19.1161843 19.1161843
## R.T.bar 18.5872813 18.5872813
## R.T.egg 17.8743005 17.8743005
## R.npnct15.log 17.6629993 17.6629993
## R.T.hour 17.4009869 17.4009869
## R.npnct12.log 17.3685469 17.3685469
## R.T.vietnames 17.2491207 17.2491207
## R.T.chees 16.5754593 16.5754593
## R.npnct05.log 13.6125943 13.6125943
## R.npnct09.log 12.6094553 12.6094553
## R.npnct16.log 12.4578606 12.4578606
## R.T.dim 12.4375623 12.4375623
## R.T.coffe 11.7575129 11.7575129
## R.T.thai 10.8314481 10.8314481
## R.T.soup 10.6342134 10.6342134
## R.npnct10.log 8.9861660 8.9861660
## R.npnct18.log 8.9477566 8.9477566
## R.T.pizza 8.7267748 8.7267748
## R.T.cake 7.5497676 7.5497676
## R.npnct06.log 7.2579223 7.2579223
## R.npnct20.log 7.1816657 7.1816657
## R.T.pho 6.9997138 6.9997138
## R.T.taco 6.5677101 6.5677101
## R.npnct03.log 4.9536167 4.9536167
## R.P.http 2.6886638 2.6886638
## R.npnct30.log 2.0875313 2.0875313
## R.npnct23.log 1.0369798 1.0369798
## R.P.year.colon 0.2572221 0.2572221
## R.npnct21.log 0.0000000 0.0000000
# Used again in fit.data.training & predict.data.new chunks
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
    # Diagnostic plots of a model's predictions against its most important
    # features.
    #
    # Args:
    #   obs_df:         observations that already hold the prediction column
    #                   "<glb_rsp_var_out><mdl_id>".
    #   mdl_id:         id of the model whose predictions are plotted.
    #   prob_threshold: passed on to myplot_prediction_classification().
    #
    # Feature importance is taken from the global glb_featsimp_df. At most the
    # five most important features get a scatter plot; the two top features are
    # the axes of the prediction plot.
    imp_df <- glb_featsimp_df

    # Row names are feature names, possibly back-quoted and possibly of the form
    # "<feat>:<interacting feat>". Split them into the main feature and its
    # interaction partner (NA when both parts come out equal, which includes
    # names without ":").
    raw_names <- gsub("`(.*?)`", "\\1", row.names(imp_df))
    main_names <- gsub("(.*?):(.*)", "\\1", raw_names)
    partner_names <- gsub("(.*?):(.*)", "\\2", raw_names)
    partner_names <- ifelse(partner_names == main_names, NA, partner_names)
    # Cut off whatever follows ".fctr" so that the name ends in ".fctr".
    imp_df$feat <- gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", main_names)
    imp_df$feat.interact <- gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", partner_names)

    # Highest importance per (feat, feat.interact), most important first.
    imp_df <- summaryBy(importance ~ feat + feat.interact, data=imp_df, FUN=max)
    imp_df <- orderBy(~ -importance.max, imp_df)
    if (nrow(imp_df) > 5) {
        warning("Limiting important feature scatter plots to 5 out of ", nrow(imp_df))
        imp_df <- head(imp_df, 5)
    }

    # One scatter plot per retained feature: actual response and prediction
    # against the feature, in separate facets.
    # (A divider can be added per feature with
    #  + geom_vline(xintercept=<divider_val>, linetype="dotted").)
    rsp_var_out <- paste0(glb_rsp_var_out, mdl_id)
    for (feat in imp_df$feat) {
        long_df <- melt(obs_df, id.vars=feat,
                        measure.vars=c(glb_rsp_var, rsp_var_out))
        print(myplot_scatter(long_df, feat, "value", colorcol_name="variable",
                             facet_colcol_name="variable", jitter=TRUE) +
              guides(color=FALSE))
    }

    no_feats <- (nrow(imp_df) == 0)

    if (glb_is_regression) {
        if (no_feats) {
            warning("No important features in glb_fin_mdl")
        } else {
            # Second most important feature on x (".rownames" if there is only
            # one), most important feature on y.
            # Optional additions: + facet_wrap(reformulate(<feat>)) when an axis
            # feature is a factor; + geom_point(aes_string(color="<col_name>.fctr"))
            # to colour the points.
            print(myplot_prediction_regression(
                df=obs_df,
                feat_x=ifelse(nrow(imp_df) > 1, imp_df$feat[2], ".rownames"),
                feat_y=imp_df$feat[1],
                rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
                id_vars=glb_id_var))
        }
    }

    if (glb_is_classification) {
        if (no_feats) {
            warning("No features in selected model are statistically important")
        } else {
            # Optional addition:
            # + geom_hline(yintercept=<divider_val>, linetype = "dotted")
            print(myplot_prediction_classification(
                df=obs_df,
                feat_x=ifelse(nrow(imp_df) > 1, imp_df$feat[2], ".rownames"),
                feat_y=imp_df$feat[1],
                rsp_var=glb_rsp_var,
                rsp_var_out=rsp_var_out,
                id_vars=glb_id_var,
                prob_threshold=prob_threshold))
        }
    }
}
# Diagnostic plots for the selected model on the OOB observations. For binomial
# classification the model's tuned OOB probability threshold is handed over;
# in every other case prob_threshold stays NULL (its default).
glb_analytics_diag_plots(
    obs_df=glb_OOBobs_df, mdl_id=glb_sel_mdl_id,
    prob_threshold=if (glb_is_classification && glb_is_binomial) {
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"]
    } else NULL)
## Warning in glb_analytics_diag_plots(obs_df = glb_OOBobs_df, mdl_id =
## glb_sel_mdl_id, : Limiting important feature scatter plots to 5 out of 98
## [1] "Min/Max Boundaries: "
## .rownames dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rf.prob
## 215 215 Y 0.160
## 217 217 Y 0.232
## 151 151 Y 0.262
## 117 117 Y 0.288
## 101 101 Y 0.346
## 263 263 Y 0.556
## 276 276 N 0.304
## 330 330 N 0.330
## 331 331 N 0.342
## 339 339 N 0.368
## 442 442 N 0.408
## 324 324 N 0.434
## 275 275 N 0.450
## 313 313 N 0.540
## dirty.fctr.predict.All.X.no.rnorm.rf
## 215 N
## 217 N
## 151 N
## 117 N
## 101 Y
## 263 Y
## 276 Y
## 330 Y
## 331 Y
## 339 Y
## 442 Y
## 324 Y
## 275 Y
## 313 Y
## dirty.fctr.predict.All.X.no.rnorm.rf.accurate
## 215 FALSE
## 217 FALSE
## 151 FALSE
## 117 FALSE
## 101 TRUE
## 263 TRUE
## 276 FALSE
## 330 FALSE
## 331 FALSE
## 339 FALSE
## 442 FALSE
## 324 FALSE
## 275 FALSE
## 313 FALSE
## dirty.fctr.predict.All.X.no.rnorm.rf.error .label
## 215 -0.140 215
## 217 -0.068 217
## 151 -0.038 151
## 117 -0.012 117
## 101 0.000 101
## 263 0.000 263
## 276 0.004 276
## 330 0.030 330
## 331 0.042 331
## 339 0.068 339
## 442 0.108 442
## 324 0.134 324
## 275 0.150 275
## 313 0.240 313
## [1] "Inaccurate: "
## .rownames dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rf.prob
## 215 215 Y 0.160
## 59 59 Y 0.222
## 217 217 Y 0.232
## 87 87 Y 0.256
## 151 151 Y 0.262
## 69 69 Y 0.272
## dirty.fctr.predict.All.X.no.rnorm.rf
## 215 N
## 59 N
## 217 N
## 87 N
## 151 N
## 69 N
## dirty.fctr.predict.All.X.no.rnorm.rf.accurate
## 215 FALSE
## 59 FALSE
## 217 FALSE
## 87 FALSE
## 151 FALSE
## 69 FALSE
## dirty.fctr.predict.All.X.no.rnorm.rf.error
## 215 -0.140
## 59 -0.078
## 217 -0.068
## 87 -0.044
## 151 -0.038
## 69 -0.028
## .rownames dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rf.prob
## 520 520 N 0.332
## 425 425 N 0.438
## 401 401 N 0.442
## 536 536 N 0.460
## 526 526 N 0.468
## 545 545 N 0.554
## dirty.fctr.predict.All.X.no.rnorm.rf
## 520 Y
## 425 Y
## 401 Y
## 536 Y
## 526 Y
## 545 Y
## dirty.fctr.predict.All.X.no.rnorm.rf.accurate
## 520 FALSE
## 425 FALSE
## 401 FALSE
## 536 FALSE
## 526 FALSE
## 545 FALSE
## dirty.fctr.predict.All.X.no.rnorm.rf.error
## 520 0.032
## 425 0.138
## 401 0.142
## 536 0.160
## 526 0.168
## 545 0.254
## .rownames dirty.fctr dirty.fctr.predict.All.X.no.rnorm.rf.prob
## 529 529 N 0.726
## 458 458 N 0.728
## 431 431 N 0.746
## 410 410 N 0.752
## 380 380 N 0.770
## 470 470 N 0.788
## dirty.fctr.predict.All.X.no.rnorm.rf
## 529 Y
## 458 Y
## 431 Y
## 410 Y
## 380 Y
## 470 Y
## dirty.fctr.predict.All.X.no.rnorm.rf.accurate
## 529 FALSE
## 458 FALSE
## 431 FALSE
## 410 FALSE
## 380 FALSE
## 470 FALSE
## dirty.fctr.predict.All.X.no.rnorm.rf.error
## 529 0.426
## 458 0.428
## 431 0.446
## 410 0.452
## 380 0.470
## 470 0.488
# gather predictions from models better than MFO.*
#mdl_id <- "Conditional.X.rf"
#mdl_id <- "Conditional.X.cp.0.rpart"
#mdl_id <- "Conditional.X.rpart"
# glb_OOBobs_df <- glb_get_predictions(df=glb_OOBobs_df, mdl_id,
# glb_rsp_var_out)
# print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, mdl_id)],
# glb_OOBobs_df[, glb_rsp_var])$table))
# FN_OOB_ids <- c(4721, 4020, 693, 92)
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# glb_feats_df$id[1:5]])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# Export the OOB observations' id column together with every column whose name
# contains glb_rsp_var (literal match) to "<out prefix + selected model id>_OOBobs.csv";
# dots in the file stem are replaced by underscores.
write.csv(
    glb_OOBobs_df[, c(glb_id_var,
                      names(glb_OOBobs_df)[grepl(glb_rsp_var, names(glb_OOBobs_df),
                                                 fixed=TRUE)])],
    file=paste0(chartr(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id)),
                "_OOBobs.csv"),
    row.names=FALSE)
# print(glb_allobs_df[glb_allobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# dsp_tbl(Headline.contains="[Ee]bola")
# sum(sel_obs(Headline.contains="[Ee]bola"))
# ftable(xtabs(Popular ~ NewsDesk.fctr, data=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,]))
# xtabs(NewsDesk ~ Popular, #Popular ~ NewsDesk.fctr,
# data=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,],
# exclude=NULL)
# print(mycreate_xtab_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular", "NewsDesk", "SectionName", "SubsectionName")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,], c("Popular")))
# print(mycreate_tbl_df(df=glb_allobs_df[sel_obs(Headline.contains="[Ee]bola") ,],
# tbl_col_names=c("Popular", "NewsDesk")))
# write.csv(glb_chunks_df, paste0(glb_out_pfx, tail(glb_chunks_df, 1)$label, "_",
# tail(glb_chunks_df, 1)$step_minor, "_chunks1.csv"),
# row.names=FALSE)
# Record the next "fit.models" step in the chunk log without advancing the
# major step (major.inc=FALSE); myadd_chunk is defined in the sourced scripts.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 12 fit.models 7 2 180.586 194.49 13.904
## 13 fit.models 7 3 194.491 NA NA
# Columns present in glb_trnobs_df that glb_allobs_df does not have.
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## [1] "R.T.banh" "R.T.deli"
## [3] "R.T.deliveri" "R.T.ethiopian"
## [5] "R.T.falafel" "R.T.mexican"
## [7] "R.T.philli" "R.T.sandwhich"
## [9] "R.T.sum" "R.T.sushi"
## [11] "R.T.teriyaki" "R.npnct17.log"
## [13] "R.npnct22.log" "R.npnct24.log"
## [15] "R.npnct25.log" "R.npnct26.log"
## [17] "R.npnct27.log" "R.npnct28.log"
## [19] "R.npnct29.log" "R.P.daily.clip.report"
## [21] "R.P.fashion.week" "R.P.first.draft"
# Columns present in glb_fitobs_df that glb_allobs_df does not have.
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## [1] "R.T.banh" "R.T.deli"
## [3] "R.T.deliveri" "R.T.ethiopian"
## [5] "R.T.falafel" "R.T.mexican"
## [7] "R.T.philli" "R.T.sandwhich"
## [9] "R.T.sum" "R.T.sushi"
## [11] "R.T.teriyaki" "R.npnct17.log"
## [13] "R.npnct22.log" "R.npnct24.log"
## [15] "R.npnct25.log" "R.npnct26.log"
## [17] "R.npnct27.log" "R.npnct28.log"
## [19] "R.npnct29.log" "R.P.daily.clip.report"
## [21] "R.P.fashion.week" "R.P.first.draft"
# Columns present in glb_OOBobs_df that glb_allobs_df does not have; these are
# the ones copied back into glb_allobs_df by the loop that follows.
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## [1] "R.T.banh"
## [2] "R.T.deli"
## [3] "R.T.deliveri"
## [4] "R.T.ethiopian"
## [5] "R.T.falafel"
## [6] "R.T.mexican"
## [7] "R.T.philli"
## [8] "R.T.sandwhich"
## [9] "R.T.sum"
## [10] "R.T.sushi"
## [11] "R.T.teriyaki"
## [12] "R.npnct17.log"
## [13] "R.npnct22.log"
## [14] "R.npnct24.log"
## [15] "R.npnct25.log"
## [16] "R.npnct26.log"
## [17] "R.npnct27.log"
## [18] "R.npnct28.log"
## [19] "R.npnct29.log"
## [20] "R.P.daily.clip.report"
## [21] "R.P.fashion.week"
## [22] "R.P.first.draft"
## [23] "dirty.fctr.predict.All.X.no.rnorm.rf.prob"
## [24] "dirty.fctr.predict.All.X.no.rnorm.rf"
## [25] "dirty.fctr.predict.All.X.no.rnorm.rf.accurate"
# Copy every column that exists only in glb_OOBobs_df into the OOB rows of
# glb_allobs_df (rows flagged with .lcn == "OOB").
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    # Rows are paired by position, not by id, so both row sets must have the
    # same size; otherwise `[<-` may silently recycle the replacement values.
    if (sum(glb_allobs_df$.lcn == "OOB") != nrow(glb_OOBobs_df))
        stop("glb_allobs_df OOB rows and glb_OOBobs_df differ in row count; ",
             "cannot copy column: ", col)
    glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[, col]
}
# Columns present in glb_newobs_df that glb_allobs_df does not have.
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# When glb_save_envir is set, checkpoint the model-selection objects to
# "<out prefix>selmdl_dsk.RData".
if (glb_save_envir) {
    save(list=c("glb_feats_df",
                "glb_allobs_df", #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
                "glb_models_df", "dsp_models_df", "glb_models_lst",
                "glb_sel_mdl", "glb_sel_mdl_id",
                "glb_model_type"),
         file=paste0(glb_out_pfx, "selmdl_dsk.RData"))
}
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))
# Drop the temporary fit result only if it exists in this environment; an
# unconditional rm() raises "object 'ret_lst' not found" when it was never created.
if (exists("ret_lst", inherits=FALSE)) rm(ret_lst)
## Warning in rm(ret_lst): object 'ret_lst' not found
# Replay the analytics petri-net with "model.selected" appended to the list of
# available objects. The append is an assignment embedded in the replay.trans
# argument, so glb_analytics_avl_objs is updated when replay.petrisim
# evaluates that argument.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"model.selected")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Start the "fit.data.training" step in the chunk log as a new major step
# (major.inc=TRUE).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 13 fit.models 7 3 194.491 198.62 4.129
## 14 fit.data.training 8 0 198.621 NA NA
8.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
# To create specific models
# glb_fin_mdl_id <- NULL; glb_fin_mdl <- NULL;
# glb_sel_mdl_id <- "Conditional.X.cp.0.rpart";
# glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]]; print(glb_sel_mdl)
# Determine the final model.
#   * If glb_fin_mdl_id was preset to a model already in glb_models_lst, reuse
#     the selected model object as the final model.
#   * Otherwise refit the selected model's method and feature set on
#     glb_trnobs_df under model_id "Final", with each tuning parameter pinned
#     to the selected model's bestTune value, and take the most recently added
#     entry of glb_models_lst / glb_models_df as the final model.
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
    warning("Final model same as user selected model")
    # NOTE(review): this assigns glb_sel_mdl, not glb_models_lst[[glb_fin_mdl_id]];
    # the two differ if glb_fin_mdl_id != glb_sel_mdl_id -- confirm intent.
    glb_fin_mdl <- glb_sel_mdl
} else {
#     print(mdl_feats_df <- myextract_mdl_feats(sel_mdl=glb_sel_mdl,
#                                               entity_df=glb_fitobs_df))

    model_method <- glb_sel_mdl$method
    if (model_method == "custom")
        # get actual method from the model_id
        model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)

    # One-point tuning grid (min == max) per tuned parameter; parameters whose
    # best value is "none" are left out.
    tune_finmdl_df <- NULL
    if (!is.null(glb_sel_mdl$bestTune) && (nrow(glb_sel_mdl$bestTune) > 0)) {
        for (param in names(glb_sel_mdl$bestTune)) {
            #print(sprintf("param: %s", param))
            if (glb_sel_mdl$bestTune[1, param] != "none")
                tune_finmdl_df <- rbind(tune_finmdl_df,
                    data.frame(parameter=param,
                               min=glb_sel_mdl$bestTune[1, param],
                               max=glb_sel_mdl$bestTune[1, param],
                               by=1)) # by val does not matter
        }
    }

    # Sync with parameters in mydsutils.R
    # library() fails immediately if gdata (needed for trim) is missing,
    # whereas require() would only return FALSE and let the call below fail.
    library(gdata)
    ret_lst <- myfit_mdl(model_id="Final", model_method=model_method,
        indep_vars_vctr=trim(unlist(strsplit(glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                                                           "feats"], "[,]"))),
                         model_type=glb_model_type,
                            rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                            fit_df=glb_trnobs_df, OOB_df=NULL,
                            n_cv_folds=glb_n_cv_folds, tune_models_df=tune_finmdl_df,
                         # Automate from here
                         #  Issues if glb_sel_mdl$method == "rf" b/c trainControl is "oob"; not "cv"
                            model_loss_mtrx=glb_model_metric_terms,
                            model_summaryFunction=glb_sel_mdl$control$summaryFunction,
                            model_metric=glb_sel_mdl$metric,
                            model_metric_maximize=glb_sel_mdl$maximize)
    # The refit model is read back as the last entry of the global model
    # list / table.
    glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
    glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "model_id"]
}
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
## [1] "fitting model: Final.rf"
## [1] " indep_vars: R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees"
## Aggregating results
## Fitting final model on full training set
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 546 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1092 matrix numeric
## oob.times 546 -none- numeric
## classes 2 -none- character
## importance 98 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 546 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 98 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] " calling mypredict_mdl for fit:"
## threshold f.score
## 1 0.0 0.6666667
## 2 0.1 0.6893939
## 3 0.2 0.8950820
## 4 0.3 0.9945355
## 5 0.4 1.0000000
## 6 0.5 1.0000000
## 7 0.6 1.0000000
## 8 0.7 0.9851301
## 9 0.8 0.7973568
## 10 0.9 0.2776025
## 11 1.0 0.0000000
## [1] "Classifier Probability Threshold: 0.6000 to maximize f.score.fit"
## dirty.fctr dirty.fctr.predict.Final.rf.N dirty.fctr.predict.Final.rf.Y
## 1 N 273 NA
## 2 Y NA 273
## Prediction
## Reference N Y
## N 273 0
## Y 0 273
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 1.000000e+00 1.000000e+00 9.932666e-01 1.000000e+00 5.000000e-01
## AccuracyPValue McnemarPValue
## 4.341326e-165 NaN
## Warning in mypredict_mdl(mdl, df = fit_df, rsp_var, rsp_var_out,
## model_id_method, : Expecting 1 metric: Accuracy; recd: Accuracy, Kappa;
## retaining Accuracy only
## model_id model_method
## 1 Final.rf rf
## feats
## 1 R.npnct19.log, R.T.dish, R.npnct07.log, R.ndgts.log, R.T.pork, R.T.noodl, R.npnct08.log, R.npnct01.log, review.niso8859.log, R.nwrds.log, R.nuppr.log, R.nwrds.unq.log, R.nchrs.log, R.npnct13.log, R.nstopwrds.log, R.npnct15.log, R.npnct11.log, R.npnct12.log, R.npnct14.log, R.npnct04.log, R.npnct02.log, R.T.rice, R.T.thai, R.npnct30.log, R.npnct05.log, R.T.dim, R.T.chines, R.npnct09.log, R.npnct03.log, R.T.roll, R.T.chicken, R.npnct18.log, R.T.seattl, R.npnct21.log, R.T.vietnames, R.T.cheap, R.T.bbq, R.T.park, R.npnct10.log, R.T.egg, R.T.tofu, R.T.best, R.P.http, R.npnct23.log, R.T.soup, R.T.great, R.npnct16.log, R.T.get, R.T.fri, R.T.pho, R.T.love, R.T.food, R.T.time, R.T.restaur, R.npnct06.log, R.T.tri, R.T.sauc, R.T.servic, R.T.ive, R.T.can, R.npnct20.log, R.T.menu, R.T.lunch, R.P.year.colon, R.T.place, R.T.tabl, R.T.fresh, R.T.one, R.T.breakfast, R.sum.TfIdf, R.T.hour, R.T.tast, R.T.order, R.T.coffe, R.T.night, R.T.taco, R.T.price, R.ratio.nstopwrds.nwrds, R.T.delici, R.T.friend, R.T.pretti, R.T.burger, R.T.sandwich, R.T.cake, R.T.realli, R.T.alway, R.T.good, R.T.just, R.T.pizza, R.T.happi, R.T.beer, R.T.drink, R.ratio.sum.TfIdf.nwrds, R.T.bar, R.T.like, R.T.salad, R.T.nice, R.T.chees
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 5.781 2.979
## max.auc.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 1 0.6 1 0.6245421
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9932666 1 0.2490842
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008392996 0.01678599
# ret_lst is only created when the final model was refit; when an existing
# model was reused there is nothing to remove, so guard against the
# "object 'ret_lst' not found" warning.
if (exists("ret_lst", inherits=FALSE)) rm(ret_lst)
# Record the next "fit.data.training" step without advancing the major step.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=FALSE)
## label step_major step_minor bgn end elapsed
## 14 fit.data.training 8 0 198.621 207.78 9.159
## 15 fit.data.training 8 1 207.781 NA NA
# Add the final model's predictions to the training observations. For binomial
# classification the default probability threshold is the selected model's
# OOB-optimal threshold; otherwise no default is supplied (NULL).
# A scalar if/else is required here: ifelse(cond, x, NULL) raises
# "replacement has length zero" whenever cond is FALSE.
glb_trnobs_df <- glb_get_predictions(df=glb_trnobs_df, mdl_id=glb_fin_mdl_id,
    rsp_var_out=glb_rsp_var_out,
    prob_threshold_def=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id, "opt.prob.threshold.OOB"] else
        NULL)
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id, : Using default probability threshold: 0.3
# Keep a copy of the importance table as it was before the final model is
# folded in.
sav_featsimp_df <- glb_featsimp_df
#glb_feats_df <- sav_feats_df
# glb_feats_df <- mymerge_feats_importance(feats_df=glb_feats_df, sel_mdl=glb_fin_mdl,
# entity_df=glb_trnobs_df)
# Refresh the table from the final model, then copy the resulting `importance`
# column into a column named "<final model id>.importance".
glb_featsimp_df <- myget_feats_importance(mdl=glb_fin_mdl, featsimp_df=glb_featsimp_df)
glb_featsimp_df[, paste0(glb_fin_mdl_id, ".importance")] <- glb_featsimp_df$importance
print(glb_featsimp_df)
## All.X.no.rnorm.rf.importance importance
## R.T.noodl 87.2391755 100.0000000
## R.T.get 70.5792463 78.9768336
## R.T.pork 44.0180925 69.9214630
## R.npnct19.log 46.6818734 68.8253565
## R.T.rice 55.6705172 60.7182663
## R.T.like 100.0000000 60.4500930
## R.T.dish 19.1161843 57.0674188
## R.ratio.nstopwrds.nwrds 63.6378014 55.8824383
## R.T.food 60.4651889 52.9449292
## R.T.salad 24.8395623 48.7225772
## R.sum.TfIdf 62.1093972 45.0313804
## R.T.place 41.9281640 44.7149910
## R.T.cheap 62.1777579 44.2980060
## R.T.great 29.5534482 42.9594181
## R.T.tabl 37.8743449 42.9190218
## R.T.tofu 56.2902664 41.4012469
## R.T.time 39.2630311 40.7059783
## R.T.menu 40.3916423 40.3838335
## R.T.one 44.5314559 40.2779375
## R.T.price 27.5329395 40.2378141
## R.T.just 79.4737855 39.9219860
## R.T.best 33.6561292 39.4008782
## R.T.chicken 24.6102709 38.8696419
## R.T.good 37.8082349 38.8112758
## R.T.restaur 36.9685854 38.8034540
## R.T.order 49.1963338 36.0541317
## R.npnct07.log 25.3999403 35.7441725
## R.T.seattl 59.3238509 35.6773067
## R.T.nice 29.0062734 34.5815621
## R.T.tri 61.6269833 34.4652260
## R.T.servic 40.5746306 33.8852363
## R.T.pretti 45.4391869 32.0693406
## R.T.friend 62.0619954 31.6373694
## R.T.love 38.0948489 31.6234295
## R.T.park 25.1221080 31.5755882
## R.T.alway 36.9492266 30.2606057
## R.T.ive 51.5467404 29.6509864
## R.T.delici 23.1777526 29.1585977
## R.T.realli 31.6891565 28.4019116
## R.T.night 28.1817483 27.1912146
## R.T.fresh 34.1930178 26.8829679
## R.T.tast 26.7182134 26.8616418
## R.nwrds.unq.log 35.3268064 25.4898715
## R.T.can 30.1658680 25.4484740
## R.T.fri 27.6661168 25.3880015
## R.T.sauc 30.0404279 25.2762115
## R.T.happi 37.8723562 24.6272909
## R.T.roll 57.2911371 24.5711925
## R.npnct13.log 27.2218975 24.0822878
## R.T.lunch 19.8535901 23.8120407
## R.ndgts.log 20.9795332 23.5583553
## review.niso8859.log 32.2958745 23.3887868
## R.T.drink 30.6058139 22.5228282
## R.npnct15.log 17.6629993 21.6705666
## R.T.chees 16.5754593 21.4997647
## R.nuppr.log 26.5430417 20.8114704
## R.npnct01.log 27.1122732 20.4208664
## R.npnct11.log 26.8258200 19.8114895
## R.npnct12.log 17.3685469 19.3124814
## R.npnct05.log 13.6125943 18.8013987
## R.npnct08.log 20.0983473 17.8707357
## R.T.sandwich 23.3680114 17.5672777
## R.T.bar 18.5872813 17.5493325
## R.nstopwrds.log 19.8169736 17.1525070
## R.T.vietnames 17.2491207 16.9367746
## R.T.egg 17.8743005 16.7503516
## R.nchrs.log 24.4698101 16.5240877
## R.T.bbq 51.6988580 16.4806085
## R.T.beer 22.0190978 16.3650168
## R.T.soup 10.6342134 16.2998420
## R.ratio.sum.TfIdf.nwrds 19.4284228 16.2513277
## R.nwrds.log 38.4495825 16.0287221
## R.T.hour 17.4009869 15.8353105
## R.T.breakfast 22.6899122 15.8135681
## R.npnct02.log 19.1196937 14.5323465
## R.npnct16.log 12.4578606 14.4455455
## R.T.thai 10.8314481 13.3508461
## R.T.pizza 8.7267748 13.0754000
## R.T.coffe 11.7575129 12.9027876
## R.npnct04.log 27.8521563 12.8486190
## R.npnct14.log 21.9373706 11.7492076
## R.npnct06.log 7.2579223 11.0715889
## R.T.dim 12.4375623 10.7721673
## R.T.cake 7.5497676 10.0017191
## R.T.taco 6.5677101 9.8349681
## R.T.pho 6.9997138 9.5527083
## R.T.chines 24.5195179 9.5077012
## R.T.burger 21.9055347 9.4028868
## R.npnct30.log 2.0875313 8.9625966
## R.npnct09.log 12.6094553 8.8467477
## R.npnct18.log 8.9477566 4.2821133
## R.npnct10.log 8.9861660 3.9784033
## R.npnct03.log 4.9536167 2.9998291
## R.P.http 2.6886638 2.2426837
## R.npnct20.log 7.1816657 1.1877966
## R.npnct23.log 1.0369798 0.8501092
## R.P.year.colon 0.2572221 0.7676892
## R.npnct21.log 0.0000000 0.0000000
## Final.rf.importance
## R.T.noodl 100.0000000
## R.T.get 78.9768336
## R.T.pork 69.9214630
## R.npnct19.log 68.8253565
## R.T.rice 60.7182663
## R.T.like 60.4500930
## R.T.dish 57.0674188
## R.ratio.nstopwrds.nwrds 55.8824383
## R.T.food 52.9449292
## R.T.salad 48.7225772
## R.sum.TfIdf 45.0313804
## R.T.place 44.7149910
## R.T.cheap 44.2980060
## R.T.great 42.9594181
## R.T.tabl 42.9190218
## R.T.tofu 41.4012469
## R.T.time 40.7059783
## R.T.menu 40.3838335
## R.T.one 40.2779375
## R.T.price 40.2378141
## R.T.just 39.9219860
## R.T.best 39.4008782
## R.T.chicken 38.8696419
## R.T.good 38.8112758
## R.T.restaur 38.8034540
## R.T.order 36.0541317
## R.npnct07.log 35.7441725
## R.T.seattl 35.6773067
## R.T.nice 34.5815621
## R.T.tri 34.4652260
## R.T.servic 33.8852363
## R.T.pretti 32.0693406
## R.T.friend 31.6373694
## R.T.love 31.6234295
## R.T.park 31.5755882
## R.T.alway 30.2606057
## R.T.ive 29.6509864
## R.T.delici 29.1585977
## R.T.realli 28.4019116
## R.T.night 27.1912146
## R.T.fresh 26.8829679
## R.T.tast 26.8616418
## R.nwrds.unq.log 25.4898715
## R.T.can 25.4484740
## R.T.fri 25.3880015
## R.T.sauc 25.2762115
## R.T.happi 24.6272909
## R.T.roll 24.5711925
## R.npnct13.log 24.0822878
## R.T.lunch 23.8120407
## R.ndgts.log 23.5583553
## review.niso8859.log 23.3887868
## R.T.drink 22.5228282
## R.npnct15.log 21.6705666
## R.T.chees 21.4997647
## R.nuppr.log 20.8114704
## R.npnct01.log 20.4208664
## R.npnct11.log 19.8114895
## R.npnct12.log 19.3124814
## R.npnct05.log 18.8013987
## R.npnct08.log 17.8707357
## R.T.sandwich 17.5672777
## R.T.bar 17.5493325
## R.nstopwrds.log 17.1525070
## R.T.vietnames 16.9367746
## R.T.egg 16.7503516
## R.nchrs.log 16.5240877
## R.T.bbq 16.4806085
## R.T.beer 16.3650168
## R.T.soup 16.2998420
## R.ratio.sum.TfIdf.nwrds 16.2513277
## R.nwrds.log 16.0287221
## R.T.hour 15.8353105
## R.T.breakfast 15.8135681
## R.npnct02.log 14.5323465
## R.npnct16.log 14.4455455
## R.T.thai 13.3508461
## R.T.pizza 13.0754000
## R.T.coffe 12.9027876
## R.npnct04.log 12.8486190
## R.npnct14.log 11.7492076
## R.npnct06.log 11.0715889
## R.T.dim 10.7721673
## R.T.cake 10.0017191
## R.T.taco 9.8349681
## R.T.pho 9.5527083
## R.T.chines 9.5077012
## R.T.burger 9.4028868
## R.npnct30.log 8.9625966
## R.npnct09.log 8.8467477
## R.npnct18.log 4.2821133
## R.npnct10.log 3.9784033
## R.npnct03.log 2.9998291
## R.P.http 2.2426837
## R.npnct20.log 1.1877966
## R.npnct23.log 0.8501092
## R.P.year.colon 0.7676892
## R.npnct21.log 0.0000000
# Diagnostic plots of the final model on the training observations; binomial
# classifiers additionally pass the selected model's OOB-optimal probability
# threshold.
if (glb_is_classification && glb_is_binomial) {
    glb_analytics_diag_plots(
        obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id,
        prob_threshold=glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                                     "opt.prob.threshold.OOB"])
} else {
    glb_analytics_diag_plots(obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 98
## [1] "Min/Max Boundaries: "
## .rownames dirty.fctr dirty.fctr.predict.Final.rf.prob
## 1 1 Y 0.870
## 101 101 Y 0.732
## 191 191 Y 0.704
## 263 263 Y 0.852
## 302 302 N 0.316
## dirty.fctr.predict.Final.rf dirty.fctr.predict.Final.rf.accurate
## 1 Y TRUE
## 101 Y TRUE
## 191 Y TRUE
## 263 Y TRUE
## 302 Y FALSE
## dirty.fctr.predict.Final.rf.error .label
## 1 0.000 1
## 101 0.000 101
## 191 0.000 191
## 263 0.000 263
## 302 0.016 302
## [1] "Inaccurate: "
## .rownames dirty.fctr dirty.fctr.predict.Final.rf.prob
## 345 345 N 0.314
## 302 302 N 0.316
## 380 380 N 0.342
## dirty.fctr.predict.Final.rf dirty.fctr.predict.Final.rf.accurate
## 345 Y FALSE
## 302 Y FALSE
## 380 Y FALSE
## dirty.fctr.predict.Final.rf.error
## 345 0.014
## 302 0.016
## 380 0.042
# Collect the ids of all features that have a non-missing value in at least
# one glb_feats_df column whose name contains ".importance" (literal match).
dsp_feats_vctr <- NULL
for (var in names(glb_feats_df)[grepl(".importance", names(glb_feats_df), fixed=TRUE)]) {
    dsp_feats_vctr <- union(dsp_feats_vctr,
                            glb_feats_df[!is.na(glb_feats_df[[var]]), "id"])
}
# print(glb_trnobs_df[glb_trnobs_df$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glb_trnobs_df), value=TRUE)])
# Columns present in glb_trnobs_df that glb_allobs_df does not have; these are
# the ones copied back into glb_allobs_df by the loop that follows.
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## [1] "dirty.fctr.predict.Final.rf.prob" "dirty.fctr.predict.Final.rf"
# Copy every column that exists only in glb_trnobs_df into the training rows
# of glb_allobs_df (rows flagged with .src == "Train").
for (col in setdiff(names(glb_trnobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    # Rows are paired by position, not by id, so both row sets must have the
    # same size; otherwise `[<-` may silently recycle the replacement values.
    if (sum(glb_allobs_df$.src == "Train") != nrow(glb_trnobs_df))
        stop("glb_allobs_df Train rows and glb_trnobs_df differ in row count; ",
             "cannot copy column: ", col)
    glb_allobs_df[glb_allobs_df$.src == "Train", col] <- glb_trnobs_df[, col]
}
# Columns present in glb_fitobs_df that glb_allobs_df does not have.
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## character(0)
# Columns present in glb_OOBobs_df that glb_allobs_df does not have.
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## character(0)
# Copy every column that exists only in glb_OOBobs_df into the OOB rows of
# glb_allobs_df (rows flagged with .lcn == "OOB").
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    # Rows are paired by position, not by id, so both row sets must have the
    # same size; otherwise `[<-` may silently recycle the replacement values.
    if (sum(glb_allobs_df$.lcn == "OOB") != nrow(glb_OOBobs_df))
        stop("glb_allobs_df OOB rows and glb_OOBobs_df differ in row count; ",
             "cannot copy column: ", col)
    glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[, col]
}
# Columns present in glb_newobs_df that glb_allobs_df does not have.
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# When glb_save_envir is set, checkpoint the selected and final model objects
# to "<out prefix>dsk.RData".
if (glb_save_envir) {
    save(list=c("glb_feats_df", "glb_allobs_df",
                #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
                "glb_models_df", "dsp_models_df", "glb_models_lst", "glb_model_type",
                "glb_sel_mdl", "glb_sel_mdl_id",
                "glb_fin_mdl", "glb_fin_mdl_id"),
         file=paste0(glb_out_pfx, "dsk.RData"))
}
# Replay the analytics petri-net with "data.training.all.prediction" and
# "model.final" appended to the list of available objects. The append is an
# assignment embedded in the replay.trans argument, so glb_analytics_avl_objs
# is updated when replay.petrisim evaluates that argument.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all.prediction","model.final")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Start the "predict.data.new" step in the chunk log as a new major step
# (major.inc=TRUE).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc=TRUE)
## label step_major step_minor bgn end elapsed
## 15 fit.data.training 8 1 207.781 212.355 4.574
## 16 predict.data.new 9 0 212.356 NA NA
9.0: predict data new
# Compute final model predictions
# Add the final model's predictions to the new observations. For binomial
# classification the default probability threshold is the selected model's
# OOB-optimal threshold; otherwise no default is supplied (NULL).
# A scalar if/else is required here: ifelse(cond, x, NULL) raises
# "replacement has length zero" whenever cond is FALSE.
glb_newobs_df <- glb_get_predictions(glb_newobs_df, mdl_id=glb_fin_mdl_id,
    rsp_var_out=glb_rsp_var_out,
    prob_threshold_def=if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else
        NULL)
## Warning in glb_get_predictions(glb_newobs_df, mdl_id = glb_fin_mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.3
# Diagnostic plots of the final model on the new observations; binomial
# classifiers additionally pass the selected model's OOB-optimal probability
# threshold.
if (glb_is_classification && glb_is_binomial) {
    glb_analytics_diag_plots(
        obs_df=glb_newobs_df, mdl_id=glb_fin_mdl_id,
        prob_threshold=glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                                     "opt.prob.threshold.OOB"])
} else {
    glb_analytics_diag_plots(obs_df=glb_newobs_df, mdl_id=glb_fin_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glb_newobs_df, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 98
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): no non-missing arguments to min;
## returning Inf
## Warning in loop_apply(n, do.ply): no non-missing arguments to max;
## returning -Inf
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
## [1] "Min/Max Boundaries: "
## .rownames dirty.fctr dirty.fctr.predict.Final.rf.prob
## 547 547 <NA> 0.324
## 551 551 <NA> 0.274
## 588 588 <NA> 0.278
## 699 699 <NA> 0.554
## dirty.fctr.predict.Final.rf dirty.fctr.predict.Final.rf.accurate
## 547 Y NA
## 551 N NA
## 588 N NA
## 699 Y NA
## dirty.fctr.predict.Final.rf.error .label
## 547 0 547
## 551 0 551
## 588 0 588
## 699 0 699
## [1] "Inaccurate: "
## .rownames dirty.fctr dirty.fctr.predict.Final.rf.prob
## NA <NA> <NA> NA
## NA.1 <NA> <NA> NA
## NA.2 <NA> <NA> NA
## NA.3 <NA> <NA> NA
## NA.4 <NA> <NA> NA
## NA.5 <NA> <NA> NA
## dirty.fctr.predict.Final.rf dirty.fctr.predict.Final.rf.accurate
## NA <NA> NA
## NA.1 <NA> NA
## NA.2 <NA> NA
## NA.3 <NA> NA
## NA.4 <NA> NA
## NA.5 <NA> NA
## dirty.fctr.predict.Final.rf.error
## NA NA
## NA.1 NA
## NA.2 NA
## NA.3 NA
## NA.4 NA
## NA.5 NA
## .rownames dirty.fctr dirty.fctr.predict.Final.rf.prob
## NA.17 <NA> <NA> NA
## NA.46 <NA> <NA> NA
## NA.126 <NA> <NA> NA
## NA.132 <NA> <NA> NA
## NA.163 <NA> <NA> NA
## NA.169 <NA> <NA> NA
## dirty.fctr.predict.Final.rf dirty.fctr.predict.Final.rf.accurate
## NA.17 <NA> NA
## NA.46 <NA> NA
## NA.126 <NA> NA
## NA.132 <NA> NA
## NA.163 <NA> NA
## NA.169 <NA> NA
## dirty.fctr.predict.Final.rf.error
## NA.17 NA
## NA.46 NA
## NA.126 NA
## NA.132 NA
## NA.163 NA
## NA.169 NA
## .rownames dirty.fctr dirty.fctr.predict.Final.rf.prob
## NA.194 <NA> <NA> NA
## NA.195 <NA> <NA> NA
## NA.196 <NA> <NA> NA
## NA.197 <NA> <NA> NA
## NA.198 <NA> <NA> NA
## NA.199 <NA> <NA> NA
## dirty.fctr.predict.Final.rf dirty.fctr.predict.Final.rf.accurate
## NA.194 <NA> NA
## NA.195 <NA> NA
## NA.196 <NA> NA
## NA.197 <NA> NA
## NA.198 <NA> NA
## NA.199 <NA> NA
## dirty.fctr.predict.Final.rf.error
## NA.194 NA
## NA.195 NA
## NA.196 NA
## NA.197 NA
## NA.198 NA
## NA.199 NA
## Warning in loop_apply(n, do.ply): Removed 200 rows containing missing
## values (geom_point).
# Build the submission frame from the final model's predictions on the new
# observations.
#   binomial classification: a single column "BDscience" holding
#     as.numeric(prediction) - 1 (presumably the factor codes shifted to 0/1),
#     followed by a printed frequency table;
#   otherwise: the id column plus the prediction column.
if (glb_is_classification && glb_is_binomial) {
#     submit_df <- glb_newobs_df[, c(glb_id_var,
#                                    paste0(glb_rsp_var_out, glb_fin_mdl_id, ".prob"))]
#     names(submit_df)[2] <- "Probability1"
    # drop=FALSE keeps the single selected column as a data frame.
    submit_df <- glb_newobs_df[, paste0(glb_rsp_var_out, glb_fin_mdl_id), drop=FALSE]
    names(submit_df)[1] <- "BDscience"
    submit_df[["BDscience"]] <- as.numeric(submit_df[["BDscience"]]) - 1
    #submit_df <-rbind(submit_df, data.frame(bdanalytics=c(" ")))
    print("Submission Stats:")
    print(table(submit_df[["BDscience"]], useNA = "ifany"))
} else {
    submit_df <- glb_newobs_df[, c(glb_id_var,
                                   paste0(glb_rsp_var_out, glb_fin_mdl_id))]
}
## [1] "Submission Stats:"
##
## 0 1
## 27 173
# Write the submission to "<out prefix + final model id>_submit.csv", with dots
# in the file stem replaced by underscores; no quoting and no row names.
submit_fname <- paste0(chartr(".", "_", paste0(glb_out_pfx, glb_fin_mdl_id)),
                       "_submit.csv")
write.csv(submit_df, file=submit_fname, quote=FALSE, row.names=FALSE)
#cat(" ", "\n", file=submit_fn, append=TRUE)
# print(orderBy(~ -max.auc.OOB, glb_models_df[, c("model_id",
# "max.auc.OOB", "max.Accuracy.OOB")]))
# For binomial classification, show the selected model's OOB-optimal
# probability threshold.
if (glb_is_classification && glb_is_binomial) {
    print(glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                        "opt.prob.threshold.OOB"])
}
## [1] 0.3
# Report the id of the selected model.
print(sprintf("glb_sel_mdl_id: %s", glb_sel_mdl_id))
## [1] "glb_sel_mdl_id: All.X.no.rnorm.rf"
# Report the id of the final model.
print(sprintf("glb_fin_mdl_id: %s", glb_fin_mdl_id))
## [1] "glb_fin_mdl_id: Final.rf"
# Rows x columns of the fit observations.
print(dim(glb_fitobs_df))
## [1] 326 127
# Model comparison table (one row per model id).
print(dsp_models_df)
## model_id max.Accuracy.OOB max.auc.OOB max.Kappa.OOB
## 7 Interact.High.cor.Y.glm 0.6181818 0.6892975 0.23636364
## 12 All.X.no.rnorm.rf 0.5454545 0.6240496 0.09090909
## 6 Max.cor.Y.glm 0.5000000 0.6812810 0.00000000
## 4 Max.cor.Y.cv.0.cp.0.rpart 0.5000000 0.6429752 0.00000000
## 10 All.X.bayesglm 0.5000000 0.6305785 0.00000000
## 9 All.X.glm 0.5000000 0.6292562 0.00000000
## 5 Max.cor.Y.rpart 0.5000000 0.6256612 0.00000000
## 8 Low.cor.X.glm 0.5000000 0.6214876 0.00000000
## 11 All.X.no.rnorm.rpart 0.5000000 0.6192562 0.00000000
## 1 MFO.myMFO_classfr 0.5000000 0.5000000 0.00000000
## 2 Random.myrandom_classfr 0.5000000 0.5000000 0.00000000
## 3 Max.cor.Y.cv.0.rpart 0.5000000 0.5000000 0.00000000
## min.aic.fit opt.prob.threshold.OOB
## 7 429.2393 0.4
## 12 NA 0.3
## 6 434.0906 0.3
## 4 NA 0.2
## 10 481.8047 0.0
## 9 462.2375 0.0
## 5 NA 0.2
## 8 475.0190 0.0
## 11 NA 0.3
## 1 NA 0.5
## 2 NA 0.5
## 3 NA 0.5
# Regression reporting: print the selected model's OOB RMSE and, when
# glb_newobs_df carries a response column without missing values, the final
# model's prediction stats on it.
if (glb_is_regression) {
    print(sprintf("%s OOB RMSE: %0.4f", glb_sel_mdl_id,
                  glb_models_df[glb_models_df$model_id == glb_sel_mdl_id,
                                "min.RMSE.OOB"]))

    if (!is.null(glb_category_vars)) {
        # Per-category reporting is gated off for regression: stop() aborts
        # here, so the statements below it in this branch never run.
        stop("not implemented yet")
        tmp_OOBobs_df <- glb_OOBobs_df[, c(glb_category_vars, predct_accurate_var_name)]
        names(tmp_OOBobs_df)[length(names(tmp_OOBobs_df))] <- "accurate.OOB"
        aOOB_ctgry_df <- mycreate_xtab_df(tmp_OOBobs_df, names(tmp_OOBobs_df))
        aOOB_ctgry_df[is.na(aOOB_ctgry_df)] <- 0
        aOOB_ctgry_df <- mutate(aOOB_ctgry_df,
                                .n.OOB = accurate.OOB.FALSE + accurate.OOB.TRUE,
                                max.accuracy.OOB = accurate.OOB.TRUE / .n.OOB)
        #intersect(names(glb_ctgry_df), names(aOOB_ctgry_df))
        glb_ctgry_df <- merge(glb_ctgry_df, aOOB_ctgry_df, all=TRUE)
        print(orderBy(~-accurate.OOB.FALSE, glb_ctgry_df))
    }

    if ((glb_rsp_var %in% names(glb_newobs_df)) &&
        all(!is.na(glb_newobs_df[, glb_rsp_var]))) {
        pred_stats_df <- mypredict_mdl(
            mdl=glb_models_lst[[glb_fin_mdl_id]],
            df=glb_newobs_df,
            rsp_var=glb_rsp_var,
            rsp_var_out=glb_rsp_var_out,
            model_id_method=glb_fin_mdl_id,
            label="new",
            model_summaryFunction=glb_sel_mdl$control$summaryFunction,
            model_metric=glb_sel_mdl$metric,
            model_metric_maximize=glb_sel_mdl$maximize,
            ret_type="stats")
        print(sprintf("%s prediction stats for glb_newobs_df:", glb_fin_mdl_id))
        print(pred_stats_df)
    }
}
# Classification reporting:
#   1. OOB confusion matrix of the selected model (transposed table);
#   2. when glb_category_vars is set, per-category OOB accuracy merged into
#      glb_ctgry_df and printed ordered by descending inaccurate count;
#   3. when glb_newobs_df carries a response column without missing values,
#      the final model's confusion matrix on the new observations.
if (glb_is_classification) {
    print(sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id))
    print(t(confusionMatrix(
        glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
        glb_OOBobs_df[, glb_rsp_var])$table))

    if (!is.null(glb_category_vars)) {
        # predct_accurate_var_name is defined outside this section.
        tmp_OOBobs_df <- glb_OOBobs_df[, c(glb_category_vars, predct_accurate_var_name)]
        names(tmp_OOBobs_df)[length(names(tmp_OOBobs_df))] <- "accurate.OOB"
        aOOB_ctgry_df <- mycreate_xtab_df(tmp_OOBobs_df, names(tmp_OOBobs_df))
        # Missing cells in the cross-tab are treated as zero counts.
        aOOB_ctgry_df[is.na(aOOB_ctgry_df)] <- 0
        aOOB_ctgry_df <- mutate(aOOB_ctgry_df,
                                .n.OOB = accurate.OOB.FALSE + accurate.OOB.TRUE,
                                max.accuracy.OOB = accurate.OOB.TRUE / .n.OOB)
        #intersect(names(glb_ctgry_df), names(aOOB_ctgry_df))
        glb_ctgry_df <- merge(glb_ctgry_df, aOOB_ctgry_df, all=TRUE)
        print(orderBy(~-accurate.OOB.FALSE, glb_ctgry_df))
    }

    if ((glb_rsp_var %in% names(glb_newobs_df)) &&
        all(!is.na(glb_newobs_df[, glb_rsp_var]))) {
        print(sprintf("%s new confusion matrix & accuracy: ", glb_fin_mdl_id))
        print(t(confusionMatrix(
            glb_newobs_df[, paste0(glb_rsp_var_out, glb_fin_mdl_id)],
            glb_newobs_df[, glb_rsp_var])$table))
    }
}
## [1] "All.X.no.rnorm.rf OOB confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 17 93
## Y 7 103
# Prints OOB diagnostics of the selected model restricted to one myCategory value:
#   1. the transposed confusion matrix (rows = reference, columns = prediction),
#   2. the share of rows flagged accurate in predct_accurate_var_name,
#   3. the inaccurate rows with Popular == 1 (reported as FN errors),
#   4. the inaccurate rows with Popular == 0 (reported as FP errors).
#
# Args:
#   myCategory: value compared (with ==) against glb_OOBobs_df$myCategory.
#
# Returns (invisibly, via print): the data frame of FP errors.
#
# Reads the globals glb_OOBobs_df, glb_sel_mdl_id, glb_rsp_var, glb_rsp_var_out,
# glb_id_var and predct_accurate_var_name.
#
# NOTE(review): the myCategory / Popular / Headline / Snippet / Abstract /
# .clusterid column names are hard-coded; glb_OOBobs_df must contain them.
dsp_myCategory_conf_mtrx <- function(myCategory) {
    in_ctgry <- glb_OOBobs_df$myCategory == myCategory
    ctgry_df <- glb_OOBobs_df[in_ctgry, ]

    print(sprintf("%s OOB::myCategory=%s confusion matrix & accuracy: ",
                  glb_sel_mdl_id, myCategory))
    print(t(confusionMatrix(
        ctgry_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
        ctgry_df[, glb_rsp_var])$table))
    print(sum(ctgry_df[, predct_accurate_var_name]) / nrow(ctgry_df))

    # Ids of the observations in this category that were predicted inaccurately.
    err_ids <- glb_OOBobs_df[in_ctgry &
                             (!glb_OOBobs_df[, predct_accurate_var_name]),
                             glb_id_var]
    # Match on the configured id column (glb_id_var), i.e. the same column the
    # ids were read from, instead of a hard-coded "UniqueID" column.
    is_err <- glb_OOBobs_df[, glb_id_var] %in% err_ids
    dsp_cols <- c(".clusterid", "Popular", "Headline", "Snippet", "Abstract")

    OOB_FNerr_df <- glb_OOBobs_df[is_err & (glb_OOBobs_df$Popular == 1), dsp_cols]
    print(sprintf("%s OOB::myCategory=%s FN errors: %d", glb_sel_mdl_id, myCategory,
                  nrow(OOB_FNerr_df)))
    print(OOB_FNerr_df)

    OOB_FPerr_df <- glb_OOBobs_df[is_err & (glb_OOBobs_df$Popular == 0), dsp_cols]
    print(sprintf("%s OOB::myCategory=%s FP errors: %d", glb_sel_mdl_id, myCategory,
                  nrow(OOB_FPerr_df)))
    print(OOB_FPerr_df)
}
#dsp_myCategory_conf_mtrx(myCategory="OpEd#Opinion#")
#dsp_myCategory_conf_mtrx(myCategory="Business#Business Day#Dealbook")
#dsp_myCategory_conf_mtrx(myCategory="##")
# if (glb_is_classification) {
# print("FN_OOB_ids:")
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# glb_txt_vars])
# print(dsp_vctr <- colSums(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# setdiff(grep("[HSA].", names(glb_OOBobs_df), value=TRUE),
# union(myfind_chr_cols_df(glb_OOBobs_df),
# grep(".fctr", names(glb_OOBobs_df), fixed=TRUE, value=TRUE)))]))
# }
# Prints diagnostics for the observations whose Headline.pfx is in hdlpfx:
#   1. the columns whose name matches glb_rsp_var, for the OOB observations,
#   2. the same columns for the new observations,
#   3. column sums, over the matching new observations, of the columns whose name
#      contains "H.", "S." or "A." (excluding the columns reported by
#      myfind_chr_cols_df() and those whose name contains ".fctr"),
#   4. the non-zero subset of those sums,
#   5. the matching new observations restricted to the non-zero columns plus the
#      columns reported by myfind_chr_cols_df().
#
# Args:
#   hdlpfx: one or more Headline.pfx values to select.
#
# Returns (invisibly, via print): the object printed in step 5.
#
# Reads the globals glb_OOBobs_df, glb_newobs_df and glb_rsp_var.
dsp_hdlpfx_results <- function(hdlpfx) {
    print(hdlpfx)
    print(glb_OOBobs_df[glb_OOBobs_df$Headline.pfx %in% hdlpfx,
                        grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])

    new_rows <- glb_newobs_df$Headline.pfx %in% hdlpfx
    print(glb_newobs_df[new_rows,
                        grep(glb_rsp_var, names(glb_newobs_df), value=TRUE)])

    feat_cols <- setdiff(grep("[HSA]\\.", names(glb_newobs_df), value=TRUE),
                         union(myfind_chr_cols_df(glb_newobs_df),
                               grep(".fctr", names(glb_newobs_df),
                                    fixed=TRUE, value=TRUE)))
    # drop = FALSE keeps a data frame even when a single column is selected;
    # colSums() fails on the bare vector that default dropping would produce.
    print(dsp_vctr <- colSums(glb_newobs_df[new_rows, feat_cols, drop = FALSE]))
    print(dsp_vctr <- dsp_vctr[dsp_vctr != 0])
    print(glb_newobs_df[new_rows,
                        union(names(dsp_vctr), myfind_chr_cols_df(glb_newobs_df))])
}
#dsp_hdlpfx_results(hdlpfx="Ask Well::")
# print("myMisc::|OpEd|blank|blank|1:")
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% c(6446),
# grep(glb_rsp_var, names(glb_OOBobs_df), value=TRUE)])
# print(glb_OOBobs_df[glb_OOBobs_df$UniqueID %in% FN_OOB_ids,
# c("WordCount", "WordCount.log", "myMultimedia",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Vv]ideo"), ],
# c(glb_rsp_var, "myMultimedia")))
# dsp_chisq.test(Headline.contains="[Vi]deo")
# print(glb_allobs_df[sel_obs(Headline.contains="[Vv]ideo"),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline")])
# print(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola", Popular=1),
# c(glb_rsp_var, "Popular", "myMultimedia", "Headline",
# "NewsDesk", "SectionName", "SubsectionName")])
# print(subset(glb_feats_df, !is.na(importance))[,
# c("is.ConditionalX.y",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, is.ConditionalX.y & is.na(importance))[,
# c("is.ConditionalX.y",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, !is.na(importance))[,
# c("zeroVar", "nzv", "myNearZV",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# print(subset(glb_feats_df, is.na(importance))[,
# c("zeroVar", "nzv", "myNearZV",
# grep("importance", names(glb_feats_df), fixed=TRUE, value=TRUE))])
# Print glb_featsimp_df sorted by descending value of the
# "<glb_sel_mdl_id>.importance" column; the ordering formula is built from the
# selected model id at run time.
print(
    orderBy(
        as.formula(paste0("~ -", glb_sel_mdl_id, ".importance")),
        glb_featsimp_df
    )
)
## All.X.no.rnorm.rf.importance importance
## R.T.like 100.0000000 60.4500930
## R.T.noodl 87.2391755 100.0000000
## R.T.just 79.4737855 39.9219860
## R.T.get 70.5792463 78.9768336
## R.ratio.nstopwrds.nwrds 63.6378014 55.8824383
## R.T.cheap 62.1777579 44.2980060
## R.sum.TfIdf 62.1093972 45.0313804
## R.T.friend 62.0619954 31.6373694
## R.T.tri 61.6269833 34.4652260
## R.T.food 60.4651889 52.9449292
## R.T.seattl 59.3238509 35.6773067
## R.T.roll 57.2911371 24.5711925
## R.T.tofu 56.2902664 41.4012469
## R.T.rice 55.6705172 60.7182663
## R.T.bbq 51.6988580 16.4806085
## R.T.ive 51.5467404 29.6509864
## R.T.order 49.1963338 36.0541317
## R.npnct19.log 46.6818734 68.8253565
## R.T.pretti 45.4391869 32.0693406
## R.T.one 44.5314559 40.2779375
## R.T.pork 44.0180925 69.9214630
## R.T.place 41.9281640 44.7149910
## R.T.servic 40.5746306 33.8852363
## R.T.menu 40.3916423 40.3838335
## R.T.time 39.2630311 40.7059783
## R.nwrds.log 38.4495825 16.0287221
## R.T.love 38.0948489 31.6234295
## R.T.tabl 37.8743449 42.9190218
## R.T.happi 37.8723562 24.6272909
## R.T.good 37.8082349 38.8112758
## R.T.restaur 36.9685854 38.8034540
## R.T.alway 36.9492266 30.2606057
## R.nwrds.unq.log 35.3268064 25.4898715
## R.T.fresh 34.1930178 26.8829679
## R.T.best 33.6561292 39.4008782
## review.niso8859.log 32.2958745 23.3887868
## R.T.realli 31.6891565 28.4019116
## R.T.drink 30.6058139 22.5228282
## R.T.can 30.1658680 25.4484740
## R.T.sauc 30.0404279 25.2762115
## R.T.great 29.5534482 42.9594181
## R.T.nice 29.0062734 34.5815621
## R.T.night 28.1817483 27.1912146
## R.npnct04.log 27.8521563 12.8486190
## R.T.fri 27.6661168 25.3880015
## R.T.price 27.5329395 40.2378141
## R.npnct13.log 27.2218975 24.0822878
## R.npnct01.log 27.1122732 20.4208664
## R.npnct11.log 26.8258200 19.8114895
## R.T.tast 26.7182134 26.8616418
## R.nuppr.log 26.5430417 20.8114704
## R.npnct07.log 25.3999403 35.7441725
## R.T.park 25.1221080 31.5755882
## R.T.salad 24.8395623 48.7225772
## R.T.chicken 24.6102709 38.8696419
## R.T.chines 24.5195179 9.5077012
## R.nchrs.log 24.4698101 16.5240877
## R.T.sandwich 23.3680114 17.5672777
## R.T.delici 23.1777526 29.1585977
## R.T.breakfast 22.6899122 15.8135681
## R.T.beer 22.0190978 16.3650168
## R.npnct14.log 21.9373706 11.7492076
## R.T.burger 21.9055347 9.4028868
## R.ndgts.log 20.9795332 23.5583553
## R.npnct08.log 20.0983473 17.8707357
## R.T.lunch 19.8535901 23.8120407
## R.nstopwrds.log 19.8169736 17.1525070
## R.ratio.sum.TfIdf.nwrds 19.4284228 16.2513277
## R.npnct02.log 19.1196937 14.5323465
## R.T.dish 19.1161843 57.0674188
## R.T.bar 18.5872813 17.5493325
## R.T.egg 17.8743005 16.7503516
## R.npnct15.log 17.6629993 21.6705666
## R.T.hour 17.4009869 15.8353105
## R.npnct12.log 17.3685469 19.3124814
## R.T.vietnames 17.2491207 16.9367746
## R.T.chees 16.5754593 21.4997647
## R.npnct05.log 13.6125943 18.8013987
## R.npnct09.log 12.6094553 8.8467477
## R.npnct16.log 12.4578606 14.4455455
## R.T.dim 12.4375623 10.7721673
## R.T.coffe 11.7575129 12.9027876
## R.T.thai 10.8314481 13.3508461
## R.T.soup 10.6342134 16.2998420
## R.npnct10.log 8.9861660 3.9784033
## R.npnct18.log 8.9477566 4.2821133
## R.T.pizza 8.7267748 13.0754000
## R.T.cake 7.5497676 10.0017191
## R.npnct06.log 7.2579223 11.0715889
## R.npnct20.log 7.1816657 1.1877966
## R.T.pho 6.9997138 9.5527083
## R.T.taco 6.5677101 9.8349681
## R.npnct03.log 4.9536167 2.9998291
## R.P.http 2.6886638 2.2426837
## R.npnct30.log 2.0875313 8.9625966
## R.npnct23.log 1.0369798 0.8501092
## R.P.year.colon 0.2572221 0.7676892
## R.npnct21.log 0.0000000 0.0000000
## Final.rf.importance
## R.T.like 60.4500930
## R.T.noodl 100.0000000
## R.T.just 39.9219860
## R.T.get 78.9768336
## R.ratio.nstopwrds.nwrds 55.8824383
## R.T.cheap 44.2980060
## R.sum.TfIdf 45.0313804
## R.T.friend 31.6373694
## R.T.tri 34.4652260
## R.T.food 52.9449292
## R.T.seattl 35.6773067
## R.T.roll 24.5711925
## R.T.tofu 41.4012469
## R.T.rice 60.7182663
## R.T.bbq 16.4806085
## R.T.ive 29.6509864
## R.T.order 36.0541317
## R.npnct19.log 68.8253565
## R.T.pretti 32.0693406
## R.T.one 40.2779375
## R.T.pork 69.9214630
## R.T.place 44.7149910
## R.T.servic 33.8852363
## R.T.menu 40.3838335
## R.T.time 40.7059783
## R.nwrds.log 16.0287221
## R.T.love 31.6234295
## R.T.tabl 42.9190218
## R.T.happi 24.6272909
## R.T.good 38.8112758
## R.T.restaur 38.8034540
## R.T.alway 30.2606057
## R.nwrds.unq.log 25.4898715
## R.T.fresh 26.8829679
## R.T.best 39.4008782
## review.niso8859.log 23.3887868
## R.T.realli 28.4019116
## R.T.drink 22.5228282
## R.T.can 25.4484740
## R.T.sauc 25.2762115
## R.T.great 42.9594181
## R.T.nice 34.5815621
## R.T.night 27.1912146
## R.npnct04.log 12.8486190
## R.T.fri 25.3880015
## R.T.price 40.2378141
## R.npnct13.log 24.0822878
## R.npnct01.log 20.4208664
## R.npnct11.log 19.8114895
## R.T.tast 26.8616418
## R.nuppr.log 20.8114704
## R.npnct07.log 35.7441725
## R.T.park 31.5755882
## R.T.salad 48.7225772
## R.T.chicken 38.8696419
## R.T.chines 9.5077012
## R.nchrs.log 16.5240877
## R.T.sandwich 17.5672777
## R.T.delici 29.1585977
## R.T.breakfast 15.8135681
## R.T.beer 16.3650168
## R.npnct14.log 11.7492076
## R.T.burger 9.4028868
## R.ndgts.log 23.5583553
## R.npnct08.log 17.8707357
## R.T.lunch 23.8120407
## R.nstopwrds.log 17.1525070
## R.ratio.sum.TfIdf.nwrds 16.2513277
## R.npnct02.log 14.5323465
## R.T.dish 57.0674188
## R.T.bar 17.5493325
## R.T.egg 16.7503516
## R.npnct15.log 21.6705666
## R.T.hour 15.8353105
## R.npnct12.log 19.3124814
## R.T.vietnames 16.9367746
## R.T.chees 21.4997647
## R.npnct05.log 18.8013987
## R.npnct09.log 8.8467477
## R.npnct16.log 14.4455455
## R.T.dim 10.7721673
## R.T.coffe 12.9027876
## R.T.thai 13.3508461
## R.T.soup 16.2998420
## R.npnct10.log 3.9784033
## R.npnct18.log 4.2821133
## R.T.pizza 13.0754000
## R.T.cake 10.0017191
## R.npnct06.log 11.0715889
## R.npnct20.log 1.1877966
## R.T.pho 9.5527083
## R.T.taco 9.8349681
## R.npnct03.log 2.9998291
## R.P.http 2.2426837
## R.npnct30.log 8.9625966
## R.npnct23.log 0.8501092
## R.P.year.colon 0.7676892
## R.npnct21.log 0.0000000
# players_df <- data.frame(id=c("Chavez", "Giambi", "Menechino", "Myers", "Pena"),
# OBP=c(0.338, 0.391, 0.369, 0.313, 0.361),
# SLG=c(0.540, 0.450, 0.374, 0.447, 0.500),
# cost=c(1400000, 1065000, 295000, 800000, 300000))
# players_df$RS.predict <- predict(glb_models_lst[[csm_mdl_id]], players_df)
# print(orderBy(~ -RS.predict, players_df))
# Sync columns that exist only in the partition data frames back into
# glb_allobs_df, optionally save the prediction-stage objects, and drop
# temporaries.

# Columns present in glb_trnobs_df but missing from glb_allobs_df are printed and
# then copied into the rows of glb_allobs_df where .src == "Train".
# NOTE(review): values are assigned by position, which assumes glb_trnobs_df rows
# are in the same order as the .src == "Train" rows of glb_allobs_df - confirm.
if (length(diff <- setdiff(names(glb_trnobs_df), names(glb_allobs_df))) > 0) {
    print(diff)
}
for (col in setdiff(names(glb_trnobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    glb_allobs_df[glb_allobs_df$.src == "Train", col] <- glb_trnobs_df[, col]
}

# Columns found only in glb_fitobs_df are reported but not copied.
if (length(diff <- setdiff(names(glb_fitobs_df), names(glb_allobs_df))) > 0) {
    print(diff)
}

# Columns found only in glb_OOBobs_df are printed and copied into the rows where
# .lcn == "OOB" (same positional-alignment assumption as above - confirm).
if (length(diff <- setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) > 0) {
    print(diff)
}
for (col in setdiff(names(glb_OOBobs_df), names(glb_allobs_df))) {
    # Merge or cbind ?
    glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <- glb_OOBobs_df[, col]
}

# Columns found only in glb_newobs_df are reported but not copied.
if (length(diff <- setdiff(names(glb_newobs_df), names(glb_allobs_df))) > 0) {
    print(diff)
}

if (glb_save_envir) {
    save(glb_feats_df, glb_allobs_df,
         #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
         glb_models_df, dsp_models_df, glb_models_lst, glb_model_type,
         glb_sel_mdl, glb_sel_mdl_id,
         glb_fin_mdl, glb_fin_mdl_id,
         file=paste0(glb_out_pfx, "prdnew_dsk.RData"))
}

# tmp_OOBobs_df is only created when glb_category_vars is set, and rm() warns
# about names that do not exist, so remove only the temporaries that are present.
rm(list = intersect(c("submit_df", "tmp_OOBobs_df"), ls()))
## Warning in rm(submit_df, tmp_OOBobs_df): object 'tmp_OOBobs_df' not found
# tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn,
# replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
# "data.new.prediction")), flip_coord=TRUE)
# print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())
# Register the "display.session.info" chunk in glb_chunks_df; major.inc=TRUE is
# passed through to myadd_chunk() (the printed table below shows the new row with
# step_major advanced and step_minor at 0).
glb_chunks_df <- myadd_chunk(
    glb_chunks_df,
    "display.session.info",
    major.inc = TRUE
)
## label step_major step_minor bgn end elapsed
## 16 predict.data.new 9 0 212.356 216.241 3.885
## 17 display.session.info 10 0 216.241 NA NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent. #{r q1, cache=FALSE} # print(t.test(subset(cars_df, am_fctr == "automatic")$mpg, # subset(cars_df, am_fctr == "manual")$mpg, # var.equal=FALSE)$conf) # We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission yields better miles per gallon than automatic transmission.
## label step_major step_minor bgn end elapsed
## 6 extract.features 3 0 24.022 129.336 105.315
## 11 fit.models 7 1 153.799 180.585 26.786
## 10 fit.models 7 0 132.229 153.799 21.570
## 12 fit.models 7 2 180.586 194.490 13.904
## 14 fit.data.training 8 0 198.621 207.780 9.159
## 2 inspect.data 2 0 13.768 19.736 5.968
## 15 fit.data.training 8 1 207.781 212.355 4.574
## 13 fit.models 7 3 194.491 198.620 4.129
## 16 predict.data.new 9 0 212.356 216.241 3.885
## 4 transform.data 2 2 20.465 23.948 3.483
## 1 import.data 1 0 11.169 13.768 2.599
## 8 select.features 5 0 130.484 131.703 1.219
## 7 cluster.data 4 0 129.337 130.484 1.147
## 3 scrub.data 2 1 19.737 20.465 0.728
## 9 partition.data.training 6 0 131.703 132.228 0.525
## 5 manage.missing.data 2 3 23.948 24.022 0.074
## duration
## 6 105.314
## 11 26.786
## 10 21.570
## 12 13.904
## 14 9.159
## 2 5.968
## 15 4.574
## 13 4.129
## 16 3.885
## 4 3.483
## 1 2.599
## 8 1.219
## 7 1.147
## 3 0.728
## 9 0.525
## 5 0.074
## [1] "Total Elapsed Time: 216.241 secs"
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] tcltk grid parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] gdata_2.16.1 randomForest_4.6-10 arm_1.8-5
## [4] lme4_1.1-7 Rcpp_0.11.6 Matrix_1.2-1
## [7] MASS_7.3-40 rpart.plot_1.5.2 rpart_4.1-9
## [10] ROCR_1.0-7 gplots_2.17.0 caTools_1.17.1
## [13] tm_0.6-1 NLP_0.1-7 stringr_1.0.0
## [16] dplyr_0.4.1 plyr_1.8.2 sqldf_0.4-10
## [19] RSQLite_1.0.0 DBI_0.3.1 gsubfn_0.6-6
## [22] proto_0.3-10 reshape2_1.4.1 doMC_1.3.3
## [25] iterators_1.0.7 foreach_1.4.2 doBy_4.5-13
## [28] survival_2.38-1 caret_6.0-47 ggplot2_1.0.1
## [31] lattice_0.20-31
##
## loaded via a namespace (and not attached):
## [1] class_7.3-12 gtools_3.5.0 assertthat_0.1
## [4] digest_0.6.8 slam_0.1-32 BradleyTerry2_1.0-6
## [7] chron_2.3-45 coda_0.17-1 evaluate_0.7
## [10] e1071_1.6-4 lazyeval_0.1.10 minqa_1.2.4
## [13] SparseM_1.6 car_2.0-25 nloptr_1.0.4
## [16] rmarkdown_0.6.1 labeling_0.3 splines_3.2.0
## [19] munsell_0.4.2 compiler_3.2.0 mgcv_1.8-6
## [22] htmltools_0.2.6 nnet_7.3-9 codetools_0.2-11
## [25] brglm_0.5-9 bitops_1.0-6 nlme_3.1-120
## [28] gtable_0.1.2 magrittr_1.5 formatR_1.2
## [31] scales_0.2.4 KernSmooth_2.23-14 stringi_0.4-1
## [34] RColorBrewer_1.1-2 tools_3.2.0 abind_1.4-3
## [37] pbkrtest_0.4-2 yaml_2.1.13 colorspace_1.2-6
## [40] knitr_1.10.5 quantreg_5.11